diff --git a/.github/codecov.yml b/.github/codecov.yml index ca879ab64..57c4bb160 100644 --- a/.github/codecov.yml +++ b/.github/codecov.yml @@ -5,19 +5,14 @@ coverage: status: project: default: - threshold: 1% - patch: on + threshold: 20% + patch: off changes: off github_checks: annotations: false -comment: - layout: "diff, files" - behavior: default - require_changes: no - require_base: no - require_head: yes +comment: false ignore: - "docs" @@ -25,3 +20,6 @@ ignore: - "scripts" - "**/*.pb.go" - "libs/pubsub/query/query.peg.go" + - "*.md" + - "*.rst" + - "*.yml" diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 7d312b4f8..6ac3a738e 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -2,6 +2,9 @@ name: Test Coverage on: pull_request: push: + paths: + - "**.go" + - "!test/" branches: - master - release/** @@ -50,6 +53,7 @@ jobs: with: PATTERNS: | **/**.go + "!test/" go.mod go.sum - name: install @@ -72,6 +76,7 @@ jobs: with: PATTERNS: | **/**.go + "!test/" go.mod go.sum - uses: actions/download-artifact@v2 @@ -100,6 +105,7 @@ jobs: with: PATTERNS: | **/**.go + "!test/" go.mod go.sum - uses: actions/download-artifact@v2 @@ -121,7 +127,7 @@ jobs: - run: | cat ./*profile.out | grep -v "mode: atomic" >> coverage.txt if: env.GIT_DIFF - - uses: codecov/codecov-action@v2.0.3 + - uses: codecov/codecov-action@v2.1.0 with: file: ./coverage.txt if: env.GIT_DIFF diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 89797a581..e773526fd 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -40,7 +40,7 @@ jobs: platforms: all - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1.5.0 + uses: docker/setup-buildx-action@v1.6.0 - name: Login to DockerHub if: ${{ github.event_name != 'pull_request' }} diff --git a/.github/workflows/e2e-nightly-master.yml b/.github/workflows/e2e-nightly-master.yml index d6d459abc..029fee6bb 100644 --- a/.github/workflows/e2e-nightly-master.yml +++ b/.github/workflows/e2e-nightly-master.yml @@ -30,7 +30,7 @@ jobs: - name: Build working-directory: test/e2e # Run make jobs in parallel, since we can't run steps in parallel. - run: make -j2 docker generator runner + run: make -j2 docker generator runner tests - name: Generate testnets working-directory: test/e2e diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index dd2b44da3..5cc605ead 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -28,7 +28,7 @@ jobs: - name: Build working-directory: test/e2e # Run two make jobs in parallel, since we can't run steps in parallel. 
- run: make -j2 docker runner + run: make -j2 docker runner tests if: "env.GIT_DIFF != ''" - name: Run CI testnet diff --git a/.github/workflows/proto-docker.yml b/.github/workflows/proto-docker.yml index ed31025b9..ee26bd111 100644 --- a/.github/workflows/proto-docker.yml +++ b/.github/workflows/proto-docker.yml @@ -34,7 +34,7 @@ jobs: echo ::set-output name=tags::${TAGS} - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1.5.0 + uses: docker/setup-buildx-action@v1.6.0 - name: Login to DockerHub uses: docker/login-action@v1.10.0 diff --git a/.golangci.yml b/.golangci.yml index 574ed22b0..b62f926e2 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,14 +1,17 @@ linters: enable: + - asciicheck - bodyclose - deadcode - depguard - dogsled - dupl - errcheck + - exportloopref # - funlen # - gochecknoglobals # - gochecknoinits + # - gocognit - goconst - gocritic # - gocyclo @@ -22,11 +25,11 @@ linters: - ineffassign # - interfacer - lll - - misspell # - maligned + - misspell - nakedret + - nolintlint - prealloc - - exportloopref - staticcheck - structcheck - stylecheck @@ -37,9 +40,6 @@ linters: - varcheck # - whitespace # - wsl - # - gocognit - - nolintlint - - asciicheck issues: exclude-rules: diff --git a/CHANGELOG.md b/CHANGELOG.md index df18653c5..5d4f3d278 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,173 @@ # Changelog -Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermint). +Friendly reminder: We have a [bug bounty program](https://hackerone.com/tendermint). + +## v0.35 + +Special thanks to external contributors on this release: @JayT106, @bipulprasad, @alessio, @Yawning, @silasdavis, +@cuonglm, @tanyabouman, @JoeKash, @githubsands, @jeebster, @crypto-facs, @liamsi, and @gotjoshua + +### BREAKING CHANGES + +- CLI/RPC/Config + - [pubsub/events] \#6634 The `ResultEvent.Events` field is now of type `[]abci.Event` preserving event order instead of `map[string][]string`. (@alexanderbez) + - [config] \#5598 The `test_fuzz` and `test_fuzz_config` P2P settings have been removed. (@erikgrinaker) + - [config] \#5728 `fastsync.version = "v1"` is no longer supported (@melekes) + - [cli] \#5772 `gen_node_key` prints JSON-encoded `NodeKey` rather than ID and does not save it to `node_key.json` (@melekes) + - [cli] \#5777 use hyphen-case instead of snake_case for all cli commands and config parameters (@cmwaters) + - [rpc] \#6019 standardise RPC errors and return the correct status code (@bipulprasad & @cmwaters) + - [rpc] \#6168 Change default sorting to desc for `/tx_search` results (@melekes) + - [cli] \#6282 User must specify the node mode when using `tendermint init` (@cmwaters) + - [state/indexer] \#6382 reconstruct indexer, move txindex into the indexer package (@JayT106) + - [cli] \#6372 Introduce `BootstrapPeers` as part of the new p2p stack. Peers to be connected on startup (@cmwaters) + - [config] \#6462 Move `PrivValidator` configuration out of `BaseConfig` into its own section. (@tychoish) + - [rpc] \#6610 Add MaxPeerBlockHeight into /status rpc call (@JayT106) + - [blocksync/rpc] \#6620 Add TotalSyncedTime & RemainingTime to SyncInfo in /status RPC (@JayT106) + - [rpc/grpc] \#6725 Mark gRPC in the RPC layer as deprecated. + - [blocksync/v2] \#6730 Fast Sync v2 is deprecated, please use v0 + - [rpc] Add genesis_chunked method to support paginated and parallel fetching of large genesis documents. + - [rpc/jsonrpc/server] \#6785 `Listen` function updated to take an `int` argument, `maxOpenConnections`, instead of an entire config object. 
(@williambanfield)
+  - [rpc] \#6820 Update RPC methods to reflect changes in the p2p layer, disabling support for `UnsafeDialSeeds` and `UnsafeDialPeers` when used with the new p2p layer, and changing the response format of the peer list in `NetInfo` for all users.
+  - [cli] \#6854 Remove deprecated snake case commands. (@tychoish)
+
+- Apps
+  - [ABCI] \#6408 Change the `key` and `value` fields from `[]byte` to `string` in the `EventAttribute` type. (@alexanderbez)
+  - [ABCI] \#5447 Remove `SetOption` method from `ABCI.Client` interface
+  - [ABCI] \#5447 Reset `Oneof` indexes for `Request` and `Response`.
+  - [ABCI] \#5818 Use protoio for msg length delimitation. Migrates from int64 to uint64 length delimiters.
+  - [ABCI] \#3546 Add `mempool_error` field to `ResponseCheckTx`. This field will contain an error string if Tendermint encountered an error while adding a transaction to the mempool. (@williambanfield)
+  - [Version] \#6494 `TMCoreSemVer` has been renamed to `TMVersion`.
+    - It is no longer required to set ldflags to set version strings
+  - [abci/counter] \#6684 Delete counter example app
+
+- Go API
+  - [pubsub] \#6634 The `Query#Matches` method along with other pubsub methods, now accepts a `[]abci.Event` instead of `map[string][]string`. (@alexanderbez)
+  - [p2p] \#6618 \#6583 Move `p2p.NodeInfo`, `p2p.NodeID` and `p2p.NetAddress` into `types` to support use in external packages. (@tychoish)
+  - [node] \#6540 Reduce surface area of the `node` package by making most of the implementation details private. (@tychoish)
+  - [p2p] \#6547 Move the entire `p2p` package and all reactor implementations into `internal`. (@tychoish)
+  - [libs/log] \#6534 Remove the existing custom Tendermint logger backed by go-kit. The logging interface, `Logger`, remains. Tendermint still provides a default logger backed by the performant zerolog logger. (@alexanderbez)
+  - [libs/time] \#6495 Move types/time to libs/time to improve consistency. (@tychoish)
+  - [mempool] \#6529 The `Context` field has been removed from the `TxInfo` type. `CheckTx` now requires a `Context` argument. (@alexanderbez)
+  - [abci/client, proxy] \#5673 `Async` funcs return an error, `Sync` and `Async` funcs accept `context.Context` (@melekes)
+  - [p2p] Remove unused function `MakePoWTarget`. (@erikgrinaker)
+  - [libs/bits] \#5720 Validate `BitArray` in `FromProto`, which now returns an error (@melekes)
+  - [proto/p2p] Rename `DefaultNodeInfo` and `DefaultNodeInfoOther` to `NodeInfo` and `NodeInfoOther` (@erikgrinaker)
+  - [proto/p2p] Rename `NodeInfo.default_node_id` to `node_id` (@erikgrinaker)
+  - [libs/os] Kill() and {Must,}{Read,Write}File() functions have been removed. (@alessio)
+  - [store] \#5848 Remove block store state in favor of using the db iterators directly (@cmwaters)
+  - [state] \#5864 Use an iterator when pruning state (@cmwaters)
+  - [types] \#6023 Remove `tm2pb.Header`, `tm2pb.BlockID`, `tm2pb.PartSetHeader` and `tm2pb.NewValidatorUpdate`.
+    - Each of the above types has a `ToProto` and `FromProto` method or function which replaced this logic.
+  - [light] \#6054 Move `MaxRetryAttempt` option from client to provider.
+    - `NewWithOptions` now sets the max retry attempts and timeouts (@cmwaters)
+  - [all] \#6077 Change spelling from British English to American (@cmwaters)
+    - Rename "Subscription.Cancelled()" to "Subscription.Canceled()" in libs/pubsub
+    - Rename "behaviour" pkg to "behavior" and internalized it in blocksync v2
+  - [rpc/client/http] \#6176 Remove `endpoint` arg from `New`, `NewWithTimeout` and `NewWithClient` (@melekes)
+  - [rpc/client/http] \#6176 Unexpose `WSEvents` (@melekes)
+  - [rpc/jsonrpc/client/ws_client] \#6176 `NewWS` no longer accepts options (use `NewWSWithOptions` and `OnReconnect` funcs to configure the client) (@melekes)
+  - [internal/libs] \#6366 Move `autofile`, `clist`,`fail`,`flowrate`, `protoio`, `sync`, `tempfile`, `test` and `timer` lib packages to an internal folder
+  - [libs/rand] \#6364 Remove most of libs/rand in favour of standard lib's `math/rand` (@liamsi)
+  - [mempool] \#6466 The original mempool reactor has been versioned as `v0` and moved to a sub-package under the root `mempool` package.
+    Some core types have been kept in the `mempool` package such as `TxCache` and its implementations, the `Mempool` interface itself
+    and `TxInfo`. (@alexanderbez)
+  - [crypto/sr25519] \#6526 Do not re-execute the Ed25519-style key derivation step when doing signing and verification. The derivation is now done once and only once. This breaks `sr25519.GenPrivKeyFromSecret` output compatibility. (@Yawning)
+  - [types] \#6627 Move `NodeKey` to types to make the type public.
+  - [config] \#6627 Extend `config` to contain methods `LoadNodeKeyID` and `LoadorGenNodeKeyID`
+  - [blocksync] \#6755 Rename `FastSync` and `Blockchain` package to `BlockSync` (@cmwaters)
+
+- Data Storage
+  - [store/state/evidence/light] \#5771 Use an order-preserving varint key encoding (@cmwaters)
+  - [mempool] \#6396 Remove mempool's write ahead log (WAL) (previously unused by the tendermint code). (@tychoish)
+  - [state] \#6541 Move pruneBlocks from consensus/state to state/execution. (@JayT106)
+
+- Tooling
+  - [tools] \#6498 Set OS home dir instead of the hardcoded PATH. (@JayT106)
+  - [cli/indexer] \#6676 Reindex events command line tooling. (@JayT106)
+
+### FEATURES
+
+- [config] Add `--mode` flag and config variable. See [ADR-52](https://github.com/tendermint/tendermint/blob/master/docs/architecture/adr-052-tendermint-mode.md) @dongsam
+- [rpc] \#6329 Don't cap page size in unsafe mode (@gotjoshua, @cmwaters)
+- [pex] \#6305 v2 pex reactor with backwards compatibility. Introduces two new pex messages to
+  accommodate the new p2p stack. Removes the notion of seeds and crawling. All peer
+  exchange reactors behave the same. (@cmwaters)
+- [crypto] \#6376 Enable sr25519 as a validator key type
+- [mempool] \#6466 Introduction of a prioritized mempool. (@alexanderbez)
+  - `Priority` and `Sender` have been introduced into the `ResponseCheckTx` type, where the `priority` will determine the prioritization of
+    the transaction when a proposer reaps transactions for a block proposal. The `sender` field acts as an index.
+  - Operators may toggle between the legacy mempool reactor, `v0`, and the new prioritized reactor, `v1`, by setting the
+    `mempool.version` configuration, where `v1` is the default configuration.
+  - Applications that do not specify a priority, i.e. zero, will have transactions reaped by the order in which they are received by the node.
+  - Transactions are gossiped in FIFO order as they are in `v0`.
+- [config/indexer] \#6411 Introduce support for custom event indexing data sources, specifically PostgreSQL. (@JayT106) +- [blocksync/event] \#6619 Emit blocksync status event when switching consensus/blocksync (@JayT106) +- [statesync/event] \#6700 Emit statesync status start/end event (@JayT106) +- [inspect] \#6785 Add a new `inspect` command for introspecting the state and block store of a crashed tendermint node. (@williambanfield) + +### IMPROVEMENTS + +- [libs/log] Console log formatting changes as a result of \#6534 and \#6589. (@tychoish) +- [statesync] \#6566 Allow state sync fetchers and request timeout to be configurable. (@alexanderbez) +- [types] \#6478 Add `block_id` to `newblock` event (@jeebster) +- [crypto/ed25519] \#5632 Adopt zip215 `ed25519` verification. (@marbar3778) +- [crypto/ed25519] \#6526 Use [curve25519-voi](https://github.com/oasisprotocol/curve25519-voi) for `ed25519` signing and verification. (@Yawning) +- [crypto/sr25519] \#6526 Use [curve25519-voi](https://github.com/oasisprotocol/curve25519-voi) for `sr25519` signing and verification. (@Yawning) +- [privval] \#5603 Add `--key` to `init`, `gen_validator`, `testnet` & `unsafe_reset_priv_validator` for use in generating `secp256k1` keys. +- [privval] \#5725 Add gRPC support to private validator. +- [privval] \#5876 `tendermint show-validator` will query the remote signer if gRPC is being used (@marbar3778) +- [abci/client] \#5673 `Async` requests return an error if queue is full (@melekes) +- [mempool] \#5673 Cancel `CheckTx` requests if RPC client disconnects or times out (@melekes) +- [abci] \#5706 Added `AbciVersion` to `RequestInfo` allowing applications to check ABCI version when connecting to Tendermint. (@marbar3778) +- [blocksync/v1] \#5728 Remove blocksync v1 (@melekes) +- [blocksync/v0] \#5741 Relax termination conditions and increase sync timeout (@melekes) +- [cli] \#5772 `gen_node_key` output now contains node ID (`id` field) (@melekes) +- [blocksync/v2] \#5774 Send status request when new peer joins (@melekes) +- [store] \#5888 store.SaveBlock saves using batches instead of transactions for now to improve ACID properties. This is a quick fix for underlying issues around tm-db and ACID guarantees. (@githubsands) +- [consensus] \#5987 and \#5792 Remove the `time_iota_ms` consensus parameter. Merge `tmproto.ConsensusParams` and `abci.ConsensusParams`. (@marbar3778, @valardragon) +- [types] \#5994 Reduce the use of protobuf types in core logic. (@marbar3778) + - `ConsensusParams`, `BlockParams`, `ValidatorParams`, `EvidenceParams`, `VersionParams`, `sm.Version` and `version.Consensus` have become native types. They still utilize protobuf when being sent over the wire or written to disk. +- [rpc/client/http] \#6163 Do not drop events even if the `out` channel is full (@melekes) +- [node] \#6059 Validate and complete genesis doc before saving to state store (@silasdavis) +- [state] \#6067 Batch save state data (@githubsands & @cmwaters) +- [crypto] \#6120 Implement batch verification interface for ed25519 and sr25519. (@marbar3778) +- [types] \#6120 use batch verification for verifying commits signatures. + - If the key type supports the batch verification API it will try to batch verify. If the verification fails we will single verify each signature. +- [privval/file] \#6185 Return error on `LoadFilePV`, `LoadFilePVEmptyState`. Allows for better programmatic control of Tendermint. +- [privval] \#6240 Add `context.Context` to privval interface. 
+- [rpc] \#6265 set cache control in http-rpc response header (@JayT106)
+- [statesync] \#6378 Retry requests for snapshots and add a minimum discovery time (5s) for new snapshots.
+- [node/state] \#6370 graceful shutdown in the consensus reactor (@JayT106)
+- [crypto/merkle] \#6443 Improve HashAlternatives performance (@cuonglm)
+- [crypto/merkle] \#6513 Optimize HashAlternatives (@marbar3778)
+- [p2p/pex] \#6509 Improve addrBook.hash performance (@cuonglm)
+- [consensus/metrics] \#6549 Change block_size gauge to a histogram for better observability over time (@marbar3778)
+- [statesync] \#6587 Increase chunk priority and re-request chunks that don't arrive (@cmwaters)
+- [state/privval] \#6578 No GetPubKey retry beyond the proposal/voting window (@JayT106)
+- [rpc] \#6615 Add TotalGasUsed to block_results response (@crypto-facs)
+- [cmd/tendermint/commands] \#6623 replace `$HOME/.some/test/dir` with `t.TempDir` (@tanyabouman)
+- [statesync] \#6807 Implement P2P state provider as an alternative to RPC (@cmwaters)
+
+### BUG FIXES
+
+- [privval] \#5638 Increase read/write timeout to 5s and calculate ping interval based on it (@JoeKash)
+- [evidence] \#6375 Fix bug with inconsistent LightClientAttackEvidence hashing (@cmwaters)
+- [rpc] \#6507 Ensure RPC client can handle URLs without ports (@JayT106)
+- [statesync] \#6463 Adds Reverse Sync feature to fetch historical light blocks after state sync in order to verify any evidence (@cmwaters)
+- [blocksync] \#6590 Update the metrics during blocksync (@JayT106)
+
+## v0.34.13
+
+*September 6, 2021*
+
+This release backports improvements to state synchronization and ABCI
+performance under concurrent load, and the PostgreSQL event indexer.
+
+### IMPROVEMENTS
+
+- [statesync] [\#6881](https://github.com/tendermint/tendermint/issues/6881) improvements to stateprovider logic (@cmwaters)
+- [ABCI] [\#6873](https://github.com/tendermint/tendermint/issues/6873) change client to use multi-reader mutexes (@tychoish)
+- [indexing] [\#6906](https://github.com/tendermint/tendermint/issues/6906) enable the PostgreSQL indexer sink (@creachadair)
 
 ## v0.34.12
diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md
index 6c25ef89e..af278d3fb 100644
--- a/CHANGELOG_PENDING.md
+++ b/CHANGELOG_PENDING.md
@@ -9,156 +9,18 @@ Friendly reminder: We have a [bug bounty program](https://hackerone.com/tendermi
 
 ### BREAKING CHANGES
 
 - CLI/RPC/Config
-  - [pubsub/events] \#6634 The `ResultEvent.Events` field is now of type `[]abci.Event` preserving event order instead of `map[string][]string`. (@alexanderbez)
-  - [config] \#5598 The `test_fuzz` and `test_fuzz_config` P2P settings have been removed. (@erikgrinaker)
-  - [config] \#5728 `fast_sync = "v1"` is no longer supported (@melekes)
-  - [cli] \#5772 `gen_node_key` prints JSON-encoded `NodeKey` rather than ID and does not save it to `node_key.json` (@melekes)
-  - [cli] \#5777 use hyphen-case instead of snake_case for all cli commands and config parameters (@cmwaters)
-  - [rpc] \#6019 standardise RPC errors and return the correct status code (@bipulprasad & @cmwaters)
-  - [rpc] \#6168 Change default sorting to desc for `/tx_search` results (@melekes)
-  - [cli] \#6282 User must specify the node mode when using `tendermint init` (@cmwaters)
-  - [state/indexer] \#6382 reconstruct indexer, move txindex into the indexer package (@JayT106)
-  - [cli] \#6372 Introduce `BootstrapPeers` as part of the new p2p stack.
Peers to be connected on startup (@cmwaters) - - [config] \#6462 Move `PrivValidator` configuration out of `BaseConfig` into its own section. (@tychoish) - - [rpc] \#6610 Add MaxPeerBlockHeight into /status rpc call (@JayT106) - - [fastsync/rpc] \#6620 Add TotalSyncedTime & RemainingTime to SyncInfo in /status RPC (@JayT106) - - [rpc/grpc] \#6725 Mark gRPC in the RPC layer as deprecated. - - [blockchain/v2] \#6730 Fast Sync v2 is deprecated, please use v0 - - [rpc] Add genesis_chunked method to support paginated and parallel fetching of large genesis documents. - - [rpc/jsonrpc/server] \#6785 `Listen` function updated to take an `int` argument, `maxOpenConnections`, instead of an entire config object. (@williambanfield) - - [rpc] \#6820 Update RPC methods to reflect changes in the p2p layer, disabling support for `UnsafeDialPeers` and `UnsafeDialPeers` when used with the new p2p layer, and changing the response format of the peer list in `NetInfo` for all users. - - [cli] \#6854 Remove deprecated snake case commands. (@tychoish) + - Apps - - [ABCI] \#6408 Change the `key` and `value` fields from `[]byte` to `string` in the `EventAttribute` type. (@alexanderbez) - - [ABCI] \#5447 Remove `SetOption` method from `ABCI.Client` interface - - [ABCI] \#5447 Reset `Oneof` indexes for `Request` and `Response`. - - [ABCI] \#5818 Use protoio for msg length delimitation. Migrates from int64 to uint64 length delimiters. - - [ABCI] \#3546 Add `mempool_error` field to `ResponseCheckTx`. This field will contain an error string if Tendermint encountered an error while adding a transaction to the mempool. (@williambanfield) - - [Version] \#6494 `TMCoreSemVer` has been renamed to `TMVersion`. - - It is not required any longer to set ldflags to set version strings - - [abci/counter] \#6684 Delete counter example app - P2P Protocol - Go API - - [pubsub] \#6634 The `Query#Matches` method along with other pubsub methods, now accepts a `[]abci.Event` instead of `map[string][]string`. (@alexanderbez) - - [p2p] \#6618 Move `p2p.NodeInfo` into `types` to support use of the SDK. (@tychoish) - - [p2p] \#6583 Make `p2p.NodeID` and `p2p.NetAddress` exported types to support their use in the RPC layer. (@tychoish) - - [node] \#6540 Reduce surface area of the `node` package by making most of the implementation details private. (@tychoish) - - [p2p] \#6547 Move the entire `p2p` package and all reactor implementations into `internal`. (@tychoish) - - [libs/log] \#6534 Remove the existing custom Tendermint logger backed by go-kit. The logging interface, `Logger`, remains. Tendermint still provides a default logger backed by the performant zerolog logger. (@alexanderbez) - - [libs/time] \#6495 Move types/time to libs/time to improve consistency. (@tychoish) - - [mempool] \#6529 The `Context` field has been removed from the `TxInfo` type. `CheckTx` now requires a `Context` argument. (@alexanderbez) - - [abci/client, proxy] \#5673 `Async` funcs return an error, `Sync` and `Async` funcs accept `context.Context` (@melekes) - - [p2p] Remove unused function `MakePoWTarget`. (@erikgrinaker) - - [libs/bits] \#5720 Validate `BitArray` in `FromProto`, which now returns an error (@melekes) - - [proto/p2p] Rename `DefaultNodeInfo` and `DefaultNodeInfoOther` to `NodeInfo` and `NodeInfoOther` (@erikgrinaker) - - [proto/p2p] Rename `NodeInfo.default_node_id` to `node_id` (@erikgrinaker) - - [libs/os] Kill() and {Must,}{Read,Write}File() functions have been removed. 
(@alessio) - - [store] \#5848 Remove block store state in favor of using the db iterators directly (@cmwaters) - - [state] \#5864 Use an iterator when pruning state (@cmwaters) - - [types] \#6023 Remove `tm2pb.Header`, `tm2pb.BlockID`, `tm2pb.PartSetHeader` and `tm2pb.NewValidatorUpdate`. - - Each of the above types has a `ToProto` and `FromProto` method or function which replaced this logic. - - [light] \#6054 Move `MaxRetryAttempt` option from client to provider. - - `NewWithOptions` now sets the max retry attempts and timeouts (@cmwaters) - - [all] \#6077 Change spelling from British English to American (@cmwaters) - - Rename "Subscription.Cancelled()" to "Subscription.Canceled()" in libs/pubsub - - Rename "behaviour" pkg to "behavior" and internalized it in blockchain v2 - - [rpc/client/http] \#6176 Remove `endpoint` arg from `New`, `NewWithTimeout` and `NewWithClient` (@melekes) - - [rpc/client/http] \#6176 Unexpose `WSEvents` (@melekes) - - [rpc/jsonrpc/client/ws_client] \#6176 `NewWS` no longer accepts options (use `NewWSWithOptions` and `OnReconnect` funcs to configure the client) (@melekes) - - [internal/libs] \#6366 Move `autofile`, `clist`,`fail`,`flowrate`, `protoio`, `sync`, `tempfile`, `test` and `timer` lib packages to an internal folder - - [libs/rand] \#6364 Remove most of libs/rand in favour of standard lib's `math/rand` (@liamsi) - - [mempool] \#6466 The original mempool reactor has been versioned as `v0` and moved to a sub-package under the root `mempool` package. - Some core types have been kept in the `mempool` package such as `TxCache` and it's implementations, the `Mempool` interface itself - and `TxInfo`. (@alexanderbez) - - [crypto/sr25519] \#6526 Do not re-execute the Ed25519-style key derivation step when doing signing and verification. The derivation is now done once and only once. This breaks `sr25519.GenPrivKeyFromSecret` output compatibility. (@Yawning) - - [types] \#6627 Move `NodeKey` to types to make the type public. - - [config] \#6627 Extend `config` to contain methods `LoadNodeKeyID` and `LoadorGenNodeKeyID` - - [blocksync] \#6755 Rename `FastSync` and `Blockchain` package to `BlockSync` - (@cmwaters) - Blockchain Protocol -- Data Storage - - [store/state/evidence/light] \#5771 Use an order-preserving varint key encoding (@cmwaters) - - [mempool] \#6396 Remove mempool's write ahead log (WAL), (previously unused by the tendermint code). (@tychoish) - - [state] \#6541 Move pruneBlocks from consensus/state to state/execution. (@JayT106) - -- Tooling - - [tools] \#6498 Set OS home dir to instead of the hardcoded PATH. (@JayT106) - - [cli/indexer] \#6676 Reindex events command line tooling. (@JayT106) - ### FEATURES -- [config] Add `--mode` flag and config variable. See [ADR-52](https://github.com/tendermint/tendermint/blob/master/docs/architecture/adr-052-tendermint-mode.md) @dongsam -- [rpc] \#6329 Don't cap page size in unsafe mode (@gotjoshua, @cmwaters) -- [pex] \#6305 v2 pex reactor with backwards compatability. Introduces two new pex messages to - accomodate for the new p2p stack. Removes the notion of seeds and crawling. All peer - exchange reactors behave the same. (@cmwaters) -- [crypto] \#6376 Enable sr25519 as a validator key -- [mempool] \#6466 Introduction of a prioritized mempool. (@alexanderbez) - - `Priority` and `Sender` have been introduced into the `ResponseCheckTx` type, where the `priority` will determine the prioritization of - the transaction when a proposer reaps transactions for a block proposal. 
The `sender` field acts as an index. - - Operators may toggle between the legacy mempool reactor, `v0`, and the new prioritized reactor, `v1`, by setting the - `mempool.version` configuration, where `v1` is the default configuration. - - Applications that do not specify a priority, i.e. zero, will have transactions reaped by the order in which they are received by the node. - - Transactions are gossiped in FIFO order as they are in `v0`. -- [config/indexer] \#6411 Introduce support for custom event indexing data sources, specifically PostgreSQL. (@JayT106) -- [fastsync/event] \#6619 Emit fastsync status event when switching consensus/fastsync (@JayT106) -- [statesync/event] \#6700 Emit statesync status start/end event (@JayT106) -- [inspect] \#6785 Add a new `inspect` command for introspecting the state and block store of a crashed tendermint node. (@williambanfield) - ### IMPROVEMENTS -- [libs/log] Console log formatting changes as a result of \#6534 and \#6589. (@tychoish) -- [statesync] \#6566 Allow state sync fetchers and request timeout to be configurable. (@alexanderbez) -- [types] \#6478 Add `block_id` to `newblock` event (@jeebster) -- [crypto/ed25519] \#5632 Adopt zip215 `ed25519` verification. (@marbar3778) -- [crypto/ed25519] \#6526 Use [curve25519-voi](https://github.com/oasisprotocol/curve25519-voi) for `ed25519` signing and verification. (@Yawning) -- [crypto/sr25519] \#6526 Use [curve25519-voi](https://github.com/oasisprotocol/curve25519-voi) for `sr25519` signing and verification. (@Yawning) -- [privval] \#5603 Add `--key` to `init`, `gen_validator`, `testnet` & `unsafe_reset_priv_validator` for use in generating `secp256k1` keys. -- [privval] \#5725 Add gRPC support to private validator. -- [privval] \#5876 `tendermint show-validator` will query the remote signer if gRPC is being used (@marbar3778) -- [abci/client] \#5673 `Async` requests return an error if queue is full (@melekes) -- [mempool] \#5673 Cancel `CheckTx` requests if RPC client disconnects or times out (@melekes) -- [abci] \#5706 Added `AbciVersion` to `RequestInfo` allowing applications to check ABCI version when connecting to Tendermint. (@marbar3778) -- [blockchain/v1] \#5728 Remove in favor of v2 (@melekes) -- [blockchain/v0] \#5741 Relax termination conditions and increase sync timeout (@melekes) -- [cli] \#5772 `gen_node_key` output now contains node ID (`id` field) (@melekes) -- [blockchain/v2] \#5774 Send status request when new peer joins (@melekes) -- [consensus] \#5792 Deprecates the `time_iota_ms` consensus parameter, to reduce the bug surface. The parameter is no longer used. (@valardragon) -- [store] \#5888 store.SaveBlock saves using batches instead of transactions for now to improve ACID properties. This is a quick fix for underlying issues around tm-db and ACID guarantees. (@githubsands) -- [consensus] \#5987 Remove `time_iota_ms` from consensus params. Merge `tmproto.ConsensusParams` and `abci.ConsensusParams`. (@marbar3778) -- [types] \#5994 Reduce the use of protobuf types in core logic. (@marbar3778) - - `ConsensusParams`, `BlockParams`, `ValidatorParams`, `EvidenceParams`, `VersionParams`, `sm.Version` and `version.Consensus` have become native types. They still utilize protobuf when being sent over the wire or written to disk. 
-- [rpc/client/http] \#6163 Do not drop events even if the `out` channel is full (@melekes) -- [node] \#6059 Validate and complete genesis doc before saving to state store (@silasdavis) -- [state] \#6067 Batch save state data (@githubsands & @cmwaters) -- [crypto] \#6120 Implement batch verification interface for ed25519 and sr25519. (@marbar3778) -- [types] \#6120 use batch verification for verifying commits signatures. - - If the key type supports the batch verification API it will try to batch verify. If the verification fails we will single verify each signature. -- [privval/file] \#6185 Return error on `LoadFilePV`, `LoadFilePVEmptyState`. Allows for better programmatic control of Tendermint. -- [privval] \#6240 Add `context.Context` to privval interface. -- [rpc] \#6265 set cache control in http-rpc response header (@JayT106) -- [statesync] \#6378 Retry requests for snapshots and add a minimum discovery time (5s) for new snapshots. -- [node/state] \#6370 graceful shutdown in the consensus reactor (@JayT106) -- [crypto/merkle] \#6443 Improve HashAlternatives performance (@cuonglm) -- [crypto/merkle] \#6513 Optimize HashAlternatives (@marbar3778) -- [p2p/pex] \#6509 Improve addrBook.hash performance (@cuonglm) -- [consensus/metrics] \#6549 Change block_size gauge to a histogram for better observability over time (@marbar3778) -- [statesync] \#6587 Increase chunk priority and re-request chunks that don't arrive (@cmwaters) -- [state/privval] \#6578 No GetPubKey retry beyond the proposal/voting window (@JayT106) -- [rpc] \#6615 Add TotalGasUsed to block_results response (@crypto-facs) -- [cmd/tendermint/commands] \#6623 replace `$HOME/.some/test/dir` with `t.TempDir` (@tanyabouman) - ### BUG FIXES -- [privval] \#5638 Increase read/write timeout to 5s and calculate ping interval based on it (@JoeKash) -- [blockchain/v1] [\#5701](https://github.com/tendermint/tendermint/pull/5701) Handle peers without blocks (@melekes) -- [blockchain/v1] \#5711 Fix deadlock (@melekes) -- [evidence] \#6375 Fix bug with inconsistent LightClientAttackEvidence hashing (cmwaters) -- [rpc] \#6507 Ensure RPC client can handle URLs without ports (@JayT106) -- [statesync] \#6463 Adds Reverse Sync feature to fetch historical light blocks after state sync in order to verify any evidence (@cmwaters) -- [fastsync] \#6590 Update the metrics during fast-sync (@JayT106) -- [gitignore] \#6668 Fix gitignore of abci-cli (@tanyabouman) diff --git a/README.md b/README.md index d1e1df6dd..5d62a2f23 100644 --- a/README.md +++ b/README.md @@ -82,32 +82,12 @@ and familiarize yourself with our Tendermint uses [Semantic Versioning](http://semver.org/) to determine when and how the version changes. According to SemVer, anything in the public API can change at any time before version 1.0.0 -To provide some stability to Tendermint users in these 0.X.X days, the MINOR version is used -to signal breaking changes across a subset of the total public API. This subset includes all -interfaces exposed to other processes (cli, rpc, p2p, etc.), but does not -include the Go APIs. +To provide some stability to users of 0.X.X versions of Tendermint, the MINOR version is used +to signal breaking changes across Tendermint's API. This API includes all +publicly exposed types, functions, and methods in non-internal Go packages as well as +the types and methods accessible via the Tendermint RPC interface. 
-That said, breaking changes in the following packages will be documented in the
-CHANGELOG even if they don't lead to MINOR version bumps:
-
-- crypto
-- config
-- libs
-  - bits
-  - bytes
-  - json
-  - log
-  - math
-  - net
-  - os
-  - protoio
-  - rand
-  - sync
-  - strings
-  - service
-- node
-- rpc/client
-- types
+Breaking changes to these public APIs will be documented in the CHANGELOG.
 
 ### Upgrades
 
diff --git a/UPGRADING.md b/UPGRADING.md
index 9d1a426ea..99efdf225 100644
--- a/UPGRADING.md
+++ b/UPGRADING.md
@@ -2,7 +2,7 @@
 
 This guide provides instructions for upgrading to specific versions of Tendermint Core.
 
-## Unreleased
+## v0.35
 
 ### ABCI Changes
 
@@ -17,7 +17,16 @@ This guide provides instructions for upgrading to specific versions of Tendermin
 
 ### Config Changes
 
-* `fast_sync = "v1"` and `fast_sync = "v2"` are no longer supported. Please use `v0` instead.
+* The configuration file field `[fastsync]` has been renamed to `[blocksync]`.
+
+* The top level configuration file field `fast-sync` has moved under the new `[blocksync]`
+  field as `blocksync.enable`.
+
+* `blocksync.version = "v1"` and `blocksync.version = "v2"` (previously `fastsync`)
+  are no longer supported. Please use `v0` instead. During the v0.35 release cycle, `v0` was
+  determined to suit the existing needs, and the cost of maintaining the `v1` and `v2` modules
+  was judged to outweigh the benefit.
+
 * All config parameters are now hyphen-case (also known as kebab-case) instead of snake_case.
   Before restarting the node make sure you have updated all the variables in your `config.toml`
   file.
 
@@ -35,7 +44,7 @@ This guide provides instructions for upgrading to specific versions of Tendermin
 * The fast sync process as well as the blockchain package and service has all been renamed
   to block sync
 
-### Key Format Changes
+### Database Key Format Changes
 
 The format of all tendermint on-disk database keys changes in
 0.35. Upgrading nodes must either re-sync all data or run a migration
 
@@ -60,6 +69,8 @@ if needed.
 
 * You must now specify the node mode (validator|full|seed) in `tendermint init [mode]`
 
+* The `--fast-sync` command line option has been renamed to `--blocksync.enable`
+
 * If you had previously used `tendermint gen_node_key` to generate a new node
   key, keep in mind that it no longer saves the output to a file. You can use
   `tendermint init validator` or pipe the output of `tendermint gen_node_key` to
 
@@ -74,8 +85,8 @@ if needed.
 
 ### API Changes
 
-The p2p layer was reimplemented as part of the 0.35 release cycle, and
-all reactors were refactored. As part of that work these
+The p2p layer was reimplemented as part of the 0.35 release cycle and
+all reactors were refactored to accommodate the change. As part of that work these
 implementations moved into the `internal` package and are no longer
 considered part of the public Go API of tendermint. These packages are:
 
@@ -98,13 +109,11 @@ will need to change to accommodate these changes. Most notably:
 
   longer exported and have been replaced with `node.New` and
   `node.NewDefault` which provide more functional interfaces.
 
-### RPC changes
-
-#### gRPC Support
+### gRPC Support
 
 Mark gRPC in the RPC layer as deprecated and to be removed in 0.36.
 
-#### Peer Management Interface
+### Peer Management Interface
 
 When running with the new P2P Layer, the methods `UnsafeDialSeeds` and
 `UnsafeDialPeers` RPC methods will always return an error. They are
 
@@ -116,6 +125,58 @@ method changes in this release to accommodate the different way that the new
 stack tracks data about peers.
 This change affects users of both stacks.
 
+### Using the updated p2p library
+
+The P2P library was reimplemented in this release. The new implementation is
+enabled by default in this version of Tendermint. The legacy implementation is still
+included in this version of Tendermint as a backstop to work around unforeseen
+production issues. The new and legacy versions are interoperable. If necessary,
+you can enable the legacy implementation in the server configuration file.
+
+To make use of the legacy P2P implementation, add or update the following field of
+your server's configuration file under the `[p2p]` section:
+
+```toml
+[p2p]
+...
+use-legacy = true
+...
+```
+
+If you need to do this, please consider filing an issue in the Tendermint repository
+to let us know why. We plan to remove the legacy P2P code in the next (v0.36) release.
+
+#### New p2p queue types
+
+The new p2p implementation enables selection of the queue type to be used for
+passing messages between peers.
+
+The following values may be used when selecting which queue type to use:
+
+* `fifo`: (**default**) An unbuffered and lossless queue that passes messages through
+in the order in which they were received.
+
+* `priority`: A priority queue of messages.
+
+* `wdrr`: A queue implementing the Weighted Deficit Round Robin algorithm. A
+weighted deficit round robin queue is created per peer. Each queue contains a
+separate 'flow' for each of the channels of communication that exist between any two
+peers. Tendermint maintains a channel per message type between peers. Each WDRR
+queue maintains a shared buffer with a fixed capacity through which messages on different
+flows are passed.
+For more information on WDRR scheduling, see: https://en.wikipedia.org/wiki/Deficit_round_robin
+
+To select a queue type, add or update the following field under the `[p2p]`
+section of your server's configuration file.
+
+```toml
+[p2p]
+...
+queue-type = "wdrr"
+...
+```
+
+
 ### Support for Custom Reactor and Mempool Implementations
 
 The changes to the p2p layer removed existing support for custom
diff --git a/cmd/tendermint/commands/run_node.go b/cmd/tendermint/commands/run_node.go
index 97d6197a2..c174fd967 100644
--- a/cmd/tendermint/commands/run_node.go
+++ b/cmd/tendermint/commands/run_node.go
@@ -3,6 +3,8 @@ package commands
 import (
 	"bytes"
 	"crypto/sha256"
+	"errors"
+	"flag"
 	"fmt"
 	"io"
 	"os"
@@ -33,7 +35,22 @@ func AddNodeFlags(cmd *cobra.Command) {
 		"socket address to listen on for connections from external priv-validator process")
 
 	// node flags
-	cmd.Flags().Bool("fast-sync", config.FastSyncMode, "fast blockchain syncing")
+	cmd.Flags().Bool("blocksync.enable", config.BlockSync.Enable, "enable fast blockchain syncing")
+
+	// TODO (https://github.com/tendermint/tendermint/issues/6908): remove this check after the v0.35 release cycle
+	// This check was added to give users an upgrade prompt to use the new flag for syncing.
+	//
+	// The pflag package does not have a native way to print a deprecation warning
+	// and return an error. This logic was added to print a deprecation message to the user
+	// and then crash if the user attempts to use the old --fast-sync flag.
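+	//
+	// For illustration, a hypothetical invocation (not part of this change):
+	// running `tendermint start --fast-sync=false` now fails with the
+	// deprecation error returned by the callback registered below, rather
+	// than the flag being silently accepted.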
+ fs := flag.NewFlagSet("", flag.ExitOnError) + fs.Func("fast-sync", "deprecated", + func(string) error { + return errors.New("--fast-sync has been deprecated, please use --blocksync.enable") + }) + cmd.Flags().AddGoFlagSet(fs) + + cmd.Flags().MarkHidden("fast-sync") //nolint:errcheck cmd.Flags().BytesHexVar( &genesisHash, "genesis-hash", @@ -158,7 +175,7 @@ func checkGenesisHash(config *cfg.Config) error { // Compare with the flag. if !bytes.Equal(genesisHash, actualHash) { return fmt.Errorf( - "--genesis_hash=%X does not match %s hash: %X", + "--genesis-hash=%X does not match %s hash: %X", genesisHash, config.GenesisFile(), actualHash) } diff --git a/config/config.go b/config/config.go index 7e8dd5976..dfc4836da 100644 --- a/config/config.go +++ b/config/config.go @@ -76,7 +76,7 @@ type Config struct { P2P *P2PConfig `mapstructure:"p2p"` Mempool *MempoolConfig `mapstructure:"mempool"` StateSync *StateSyncConfig `mapstructure:"statesync"` - BlockSync *BlockSyncConfig `mapstructure:"fastsync"` + BlockSync *BlockSyncConfig `mapstructure:"blocksync"` Consensus *ConsensusConfig `mapstructure:"consensus"` TxIndex *TxIndexConfig `mapstructure:"tx-index"` Instrumentation *InstrumentationConfig `mapstructure:"instrumentation"` @@ -152,7 +152,7 @@ func (cfg *Config) ValidateBasic() error { return fmt.Errorf("error in [statesync] section: %w", err) } if err := cfg.BlockSync.ValidateBasic(); err != nil { - return fmt.Errorf("error in [fastsync] section: %w", err) + return fmt.Errorf("error in [blocksync] section: %w", err) } if err := cfg.Consensus.ValidateBasic(); err != nil { return fmt.Errorf("error in [consensus] section: %w", err) @@ -194,12 +194,6 @@ type BaseConfig struct { //nolint: maligned // - No priv_validator_key.json, priv_validator_state.json Mode string `mapstructure:"mode"` - // If this node is many blocks behind the tip of the chain, FastSync - // allows them to catchup quickly by downloading blocks in parallel - // and verifying their commits - // TODO: This should be moved to the blocksync config - FastSyncMode bool `mapstructure:"fast-sync"` - // Database backend: goleveldb | cleveldb | boltdb | rocksdb // * goleveldb (github.com/syndtr/goleveldb - most popular implementation) // - pure go @@ -242,23 +236,24 @@ type BaseConfig struct { //nolint: maligned // If true, query the ABCI app on connecting to a new peer // so the app can decide if we should keep the connection or not FilterPeers bool `mapstructure:"filter-peers"` // false + + Other map[string]interface{} `mapstructure:",remain"` } // DefaultBaseConfig returns a default base configuration for a Tendermint node func DefaultBaseConfig() BaseConfig { return BaseConfig{ - Genesis: defaultGenesisJSONPath, - NodeKey: defaultNodeKeyPath, - Mode: defaultMode, - Moniker: defaultMoniker, - ProxyApp: "tcp://127.0.0.1:26658", - ABCI: "socket", - LogLevel: DefaultLogLevel, - LogFormat: log.LogFormatPlain, - FastSyncMode: true, - FilterPeers: false, - DBBackend: "goleveldb", - DBPath: "data", + Genesis: defaultGenesisJSONPath, + NodeKey: defaultNodeKeyPath, + Mode: defaultMode, + Moniker: defaultMoniker, + ProxyApp: "tcp://127.0.0.1:26658", + ABCI: "socket", + LogLevel: DefaultLogLevel, + LogFormat: log.LogFormatPlain, + FilterPeers: false, + DBBackend: "goleveldb", + DBPath: "data", } } @@ -268,7 +263,6 @@ func TestBaseConfig() BaseConfig { cfg.chainID = "tendermint_test" cfg.Mode = ModeValidator cfg.ProxyApp = "kvstore" - cfg.FastSyncMode = false cfg.DBBackend = "memdb" return cfg } @@ -345,6 +339,28 @@ func (cfg BaseConfig) 
ValidateBasic() error {
 		return fmt.Errorf("unknown mode: %v", cfg.Mode)
 	}
 
+	// TODO (https://github.com/tendermint/tendermint/issues/6908) remove this check after the v0.35 release cycle.
+	// This check was added to give users an upgrade prompt to use the new
+	// configuration option in v0.35. In future release cycles they should no longer
+	// be using this configuration parameter so the check can be removed.
+	// The cfg.Other field can likely be removed at the same time if it is not referenced
+	// elsewhere as it was added to service this check.
+	if fs, ok := cfg.Other["fastsync"]; ok {
+		if _, ok := fs.(map[string]interface{}); ok {
+			return fmt.Errorf("a configuration section named 'fastsync' was found in the " +
+				"configuration file. The 'fastsync' section has been renamed to " +
+				"'blocksync', please update the 'fastsync' field in your configuration file to 'blocksync'")
+		}
+	}
+	if fs, ok := cfg.Other["fast-sync"]; ok {
+		if fs != "" {
+			return fmt.Errorf("a parameter named 'fast-sync' was found in the " +
+				"configuration file. The parameter to enable or disable quickly syncing with a blockchain " +
+				"has moved to the [blocksync] section of the configuration file as blocksync.enable. " +
+				"Please move the 'fast-sync' field in your configuration file to 'blocksync.enable'")
+		}
+	}
+
 	return nil
 }
 
@@ -884,15 +900,46 @@ func (cfg *MempoolConfig) ValidateBasic() error {
 
 // StateSyncConfig defines the configuration for the Tendermint state sync service
 type StateSyncConfig struct {
-	Enable        bool          `mapstructure:"enable"`
-	TempDir       string        `mapstructure:"temp-dir"`
-	RPCServers    []string      `mapstructure:"rpc-servers"`
-	TrustPeriod   time.Duration `mapstructure:"trust-period"`
-	TrustHeight   int64         `mapstructure:"trust-height"`
-	TrustHash     string        `mapstructure:"trust-hash"`
-	DiscoveryTime time.Duration `mapstructure:"discovery-time"`
+	// State sync rapidly bootstraps a new node by discovering, fetching, and restoring a
+	// state machine snapshot from peers instead of fetching and replaying historical
+	// blocks. Requires some peers in the network to take and serve state machine
+	// snapshots. State sync is not attempted if the node has any local state
+	// (LastBlockHeight > 0). The node will have a truncated block history, starting from
+	// the height of the snapshot.
+	Enable bool `mapstructure:"enable"`
+
+	// State sync uses light client verification to verify state. This can be done either
+	// through the P2P layer or the RPC layer. Set this to true to use the P2P layer. If
+	// false (default), the RPC layer will be used.
+	UseP2P bool `mapstructure:"use-p2p"`
+
+	// If using RPC, at least two addresses need to be provided. They should be compatible
+	// with net.Dial, for example: "host.example.com:2125".
+	RPCServers []string `mapstructure:"rpc-servers"`
+
+	// The hash and height of a trusted block. Must be within the trust-period.
+	TrustHeight int64  `mapstructure:"trust-height"`
+	TrustHash   string `mapstructure:"trust-hash"`
+
+	// The trust period should be set so that Tendermint can detect and gossip
+	// misbehavior before it is considered expired. For chains based on the Cosmos SDK,
+	// one day less than the unbonding period should suffice.
+	TrustPeriod time.Duration `mapstructure:"trust-period"`
+
+	// Time to spend discovering snapshots before initiating a restore.
+	DiscoveryTime time.Duration `mapstructure:"discovery-time"`
+
+	// Temporary directory for state sync snapshot chunks, defaults to os.TempDir().
+	// The synchronizer will create a new, randomly named directory within this directory
+	// and remove it when the sync is complete.
+	TempDir string `mapstructure:"temp-dir"`
+
+	// The timeout duration before re-requesting a chunk, possibly from a different
+	// peer (default: 15 seconds).
 	ChunkRequestTimeout time.Duration `mapstructure:"chunk-request-timeout"`
-	Fetchers            int32         `mapstructure:"fetchers"`
+
+	// The number of concurrent chunk and block fetchers to run (default: 4).
+	Fetchers int32 `mapstructure:"fetchers"`
 }
 
 func (cfg *StateSyncConfig) TrustHashBytes() []byte {
@@ -921,49 +968,51 @@ func TestStateSyncConfig() *StateSyncConfig {
 
 // ValidateBasic performs basic validation.
 func (cfg *StateSyncConfig) ValidateBasic() error {
-	if cfg.Enable {
-		if len(cfg.RPCServers) == 0 {
-			return errors.New("rpc-servers is required")
-		}
+	if !cfg.Enable {
+		return nil
+	}
+	// If we're not using the P2P stack then we need to validate the
+	// RPCServers
+	if !cfg.UseP2P {
 		if len(cfg.RPCServers) < 2 {
-			return errors.New("at least two rpc-servers entries is required")
+			return errors.New("at least two rpc-servers must be specified")
 		}
 
 		for _, server := range cfg.RPCServers {
-			if len(server) == 0 {
+			if server == "" {
 				return errors.New("found empty rpc-servers entry")
 			}
 		}
+	}
 
-		if cfg.DiscoveryTime != 0 && cfg.DiscoveryTime < 5*time.Second {
-			return errors.New("discovery time must be 0s or greater than five seconds")
-		}
+	if cfg.DiscoveryTime != 0 && cfg.DiscoveryTime < 5*time.Second {
+		return errors.New("discovery time must be 0s or greater than five seconds")
+	}
 
-		if cfg.TrustPeriod <= 0 {
-			return errors.New("trusted-period is required")
-		}
+	if cfg.TrustPeriod <= 0 {
+		return errors.New("trusted-period is required")
+	}
 
-		if cfg.TrustHeight <= 0 {
-			return errors.New("trusted-height is required")
-		}
+	if cfg.TrustHeight <= 0 {
+		return errors.New("trusted-height is required")
+	}
 
-		if len(cfg.TrustHash) == 0 {
-			return errors.New("trusted-hash is required")
-		}
+	if len(cfg.TrustHash) == 0 {
+		return errors.New("trusted-hash is required")
+	}
 
-		_, err := hex.DecodeString(cfg.TrustHash)
-		if err != nil {
-			return fmt.Errorf("invalid trusted-hash: %w", err)
-		}
+	_, err := hex.DecodeString(cfg.TrustHash)
+	if err != nil {
+		return fmt.Errorf("invalid trusted-hash: %w", err)
+	}
 
-		if cfg.ChunkRequestTimeout < 5*time.Second {
-			return errors.New("chunk-request-timeout must be at least 5 seconds")
-		}
+	if cfg.ChunkRequestTimeout < 5*time.Second {
+		return errors.New("chunk-request-timeout must be at least 5 seconds")
+	}
 
-		if cfg.Fetchers <= 0 {
-			return errors.New("fetchers is required")
-		}
+	if cfg.Fetchers <= 0 {
+		return errors.New("fetchers is required")
 	}
 
 	return nil
@@ -972,13 +1021,18 @@ func (cfg *StateSyncConfig) ValidateBasic() error {
 
 //-----------------------------------------------------------------------------
 
 // BlockSyncConfig (formerly known as FastSync) defines the configuration for the Tendermint block sync service
+// If this node is many blocks behind the tip of the chain, BlockSync
+// allows it to catch up quickly by downloading blocks in parallel
+// and verifying their commits.
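+// Block sync is enabled by default (see DefaultBlockSyncConfig below) and can
+// be turned off via the `blocksync.enable` field in the config file.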
type BlockSyncConfig struct { + Enable bool `mapstructure:"enable"` Version string `mapstructure:"version"` } // DefaultBlockSyncConfig returns a default configuration for the block sync service func DefaultBlockSyncConfig() *BlockSyncConfig { return &BlockSyncConfig{ + Enable: true, Version: BlockSyncV0, } } diff --git a/config/toml.go b/config/toml.go index 76058802c..1cb3c0615 100644 --- a/config/toml.go +++ b/config/toml.go @@ -97,11 +97,6 @@ moniker = "{{ .BaseConfig.Moniker }}" # - No priv_validator_key.json, priv_validator_state.json mode = "{{ .BaseConfig.Mode }}" -# If this node is many blocks behind the tip of the chain, FastSync -# allows them to catchup quickly by downloading blocks in parallel -# and verifying their commits -fast-sync = {{ .BaseConfig.FastSyncMode }} - # Database backend: goleveldb | cleveldb | boltdb | rocksdb | badgerdb # * goleveldb (github.com/syndtr/goleveldb - most popular implementation) # - pure go @@ -270,7 +265,7 @@ pprof-laddr = "{{ .RPC.PprofListenAddress }}" ####################################################### [p2p] -# Enable the new p2p layer. +# Enable the legacy p2p layer. use-legacy = {{ .P2P.UseLegacy }} # Select the p2p internal queue @@ -305,6 +300,7 @@ persistent-peers = "{{ .P2P.PersistentPeers }}" upnp = {{ .P2P.UPNP }} # Path to address book +# TODO: Remove once p2p refactor is complete in favor of peer store. addr-book-file = "{{ js .P2P.AddrBook }}" # Set true for strict address routability rules @@ -330,6 +326,8 @@ max-connections = {{ .P2P.MaxConnections }} max-incoming-connection-attempts = {{ .P2P.MaxIncomingConnectionAttempts }} # List of node IDs, to which a connection will be (re)established ignoring any existing limits +# TODO: Remove once p2p refactor is complete. +# ref: https://github.com/tendermint/tendermint/issues/5670 unconditional-peer-ids = "{{ .P2P.UnconditionalPeerIDs }}" # Maximum pause when redialing a persistent peer (if zero, exponential backoff is used) @@ -426,22 +424,30 @@ ttl-num-blocks = {{ .Mempool.TTLNumBlocks }} # starting from the height of the snapshot. enable = {{ .StateSync.Enable }} -# RPC servers (comma-separated) for light client verification of the synced state machine and -# retrieval of state data for node bootstrapping. Also needs a trusted height and corresponding -# header hash obtained from a trusted source, and a period during which validators can be trusted. -# -# For Cosmos SDK-based chains, trust-period should usually be about 2/3 of the unbonding time (~2 -# weeks) during which they can be financially punished (slashed) for misbehavior. +# State sync uses light client verification to verify state. This can be done either through the +# P2P layer or RPC layer. Set this to true to use the P2P layer. If false (default), RPC layer +# will be used. +use-p2p = {{ .StateSync.UseP2P }} + +# If using RPC, at least two addresses need to be provided. They should be compatible with net.Dial, +# for example: "host.example.com:2125" rpc-servers = "{{ StringsJoin .StateSync.RPCServers "," }}" + +# The hash and height of a trusted block. Must be within the trust-period. trust-height = {{ .StateSync.TrustHeight }} trust-hash = "{{ .StateSync.TrustHash }}" + +# The trust period should be set so that Tendermint can detect and gossip misbehavior before +# it is considered expired. For chains based on the Cosmos SDK, one day less than the unbonding +# period should suffice. trust-period = "{{ .StateSync.TrustPeriod }}" # Time to spend discovering snapshots before initiating a restore. 
discovery-time = "{{ .StateSync.DiscoveryTime }}"
 
-# Temporary directory for state sync snapshot chunks, defaults to the OS tempdir (typically /tmp).
-# Will create a new, randomly named directory within, and remove it when done.
+# Temporary directory for state sync snapshot chunks, defaults to os.TempDir().
+# The synchronizer will create a new, randomly named directory within this directory
+# and remove it when the sync is complete.
 temp-dir = "{{ .StateSync.TempDir }}"
 
 # The timeout duration before re-requesting a chunk, possibly from a different
@@ -454,10 +460,15 @@ fetchers = "{{ .StateSync.Fetchers }}"
 
 #######################################################
 ###       Block Sync Configuration Connections      ###
 #######################################################
-[fastsync]
+[blocksync]
+
+# If this node is many blocks behind the tip of the chain, BlockSync
+# allows it to catch up quickly by downloading blocks in parallel
+# and verifying their commits
+enable = {{ .BlockSync.Enable }}
 
 # Block Sync version to use:
-# 1) "v0" (default) - the legacy block sync implementation
+# 1) "v0" (default) - the standard Block Sync implementation
 # 2) "v2" - DEPRECATED, please use v0
 version = "{{ .BlockSync.Version }}"
diff --git a/config/toml_test.go b/config/toml_test.go
index 418cea8fa..ccf818d65 100644
--- a/config/toml_test.go
+++ b/config/toml_test.go
@@ -36,9 +36,7 @@ func TestEnsureRoot(t *testing.T) {
 	data, err := ioutil.ReadFile(filepath.Join(tmpDir, defaultConfigFilePath))
 	require.Nil(err)
 
-	if !checkConfig(string(data)) {
-		t.Fatalf("config file missing some information")
-	}
+	checkConfig(t, string(data))
 
 	ensureFiles(t, tmpDir, "data")
 }
@@ -57,9 +55,7 @@ func TestEnsureTestRoot(t *testing.T) {
 	data, err := ioutil.ReadFile(filepath.Join(rootDir, defaultConfigFilePath))
 	require.Nil(err)
 
-	if !checkConfig(string(data)) {
-		t.Fatalf("config file missing some information")
-	}
+	checkConfig(t, string(data))
 
 	// TODO: make sure the cfg returned and testconfig are the same!
 	baseConfig := DefaultBaseConfig()
@@ -67,16 +63,15 @@
 	ensureFiles(t, rootDir, defaultDataDir, baseConfig.Genesis, pvConfig.Key, pvConfig.State)
 }
 
-func checkConfig(configFile string) bool {
-	var valid bool
-
+func checkConfig(t *testing.T, configFile string) {
+	t.Helper()
 	// list of words we expect in the config
 	var elems = []string{
 		"moniker",
 		"seeds",
 		"proxy-app",
-		"fast_sync",
-		"create_empty_blocks",
+		"blocksync",
+		"create-empty-blocks",
 		"peer",
 		"timeout",
 		"broadcast",
@@ -89,10 +84,7 @@
 	}
 	for _, e := range elems {
 		if !strings.Contains(configFile, e) {
-			valid = false
-		} else {
-			valid = true
+			t.Errorf("config file was expected to contain %s but did not", e)
 		}
 	}
-	return valid
 }
diff --git a/crypto/secp256k1/secp256k1_test.go b/crypto/secp256k1/secp256k1_test.go
index 83249ef6a..f8bf29971 100644
--- a/crypto/secp256k1/secp256k1_test.go
+++ b/crypto/secp256k1/secp256k1_test.go
@@ -36,8 +36,7 @@ func TestPubKeySecp256k1Address(t *testing.T) {
 		addrBbz, _, _ := base58.CheckDecode(d.addr)
 		addrB := crypto.Address(addrBbz)
 
-		var priv secp256k1.PrivKey = secp256k1.PrivKey(privB)
-
+		priv := secp256k1.PrivKey(privB)
 		pubKey := priv.PubKey()
 		pubT, _ := pubKey.(secp256k1.PubKey)
 		pub := pubT
diff --git a/docs/app-dev/indexing-transactions.md b/docs/app-dev/indexing-transactions.md
index 15108cb05..b8b06d01b 100644
--- a/docs/app-dev/indexing-transactions.md
+++ b/docs/app-dev/indexing-transactions.md
@@ -62,7 +62,7 @@ be turned off regardless of other values provided.
 #### KV
 
 The `kv` indexer type is an embedded key-value store supported by the main
-underling Tendermint database. Using the `kv` indexer type allows you to query
+underlying Tendermint database. Using the `kv` indexer type allows you to query
 for block and transaction events directly against Tendermint's RPC. However,
 the query syntax is limited and so this indexer type might be deprecated or
 removed entirely in the future.
diff --git a/docs/nodes/configuration.md b/docs/nodes/configuration.md
index b5259f93f..ffdbaffa2 100644
--- a/docs/nodes/configuration.md
+++ b/docs/nodes/configuration.md
@@ -36,10 +36,6 @@ proxy-app = "tcp://127.0.0.1:26658"
 # A custom human readable name for this node
 moniker = "anonymous"
 
-# If this node is many blocks behind the tip of the chain, BlockSync
-# allows them to catchup quickly by downloading blocks in parallel
-# and verifying their commits
-fast-sync = true
 
 # Mode of Node: full | validator | seed (default: "validator")
 # * validator node (default)
@@ -356,11 +352,16 @@ temp-dir = ""
 
 #######################################################
 ###       BlockSync Configuration Connections       ###
 #######################################################
-[fastsync]
+[blocksync]
+
+# If this node is many blocks behind the tip of the chain, BlockSync
+# allows it to catch up quickly by downloading blocks in parallel
+# and verifying their commits
+enable = true
 
 # Block Sync version to use:
-# 1) "v0" (default) - the legacy block sync implementation
-# 2) "v2" - complete redesign of v0, optimized for testability & readability
+# 1) "v0" (default) - the standard block sync implementation
+# 2) "v2" - DEPRECATED, please use v0
 version = "v0"
 
 #######################################################
diff --git a/docs/rfc/README.md b/docs/rfc/README.md
index 97f9d8306..3af5a33e8 100644
--- a/docs/rfc/README.md
+++ b/docs/rfc/README.md
@@ -38,6 +38,9 @@ sections.
## Table of Contents - [RFC-000: P2P Roadmap](./rfc-000-p2p-roadmap.rst) +- [RFC-001: Storage Engines](./rfc-001-storage-engine.rst) +- [RFC-002: Interprocess Communication](./rfc-002-ipc-ecosystem.md) - [RFC-003: Performance Taxonomy](./rfc-003-performance-questions.md) +- [RFC-004: E2E Test Framework Enhancements](./rfc-004-e2e-framework.rst) diff --git a/docs/rfc/rfc-001-storage-engine.rst b/docs/rfc/rfc-001-storage-engine.rst new file mode 100644 index 000000000..560e8a8b3 --- /dev/null +++ b/docs/rfc/rfc-001-storage-engine.rst @@ -0,0 +1,179 @@ +=========================================== +RFC 001: Storage Engines and Database Layer +=========================================== + +Changelog +--------- + +- 2021-04-19: Initial Draft (gist) +- 2021-09-02: Migrated to RFC folder, with some updates + +Abstract +-------- + +The aspect of Tendermint that's responsible for persistence and storage (often +"the database" internally) represents a bottleneck in the architecture of the +platform, which the 0.36 release presents a good opportunity to correct. The +current storage engine layer provides a great deal of flexibility that is +difficult for users to leverage or benefit from, while also making it harder +for Tendermint Core developers to deliver improvements to the storage engine. This +RFC discusses the possible improvements to this layer of the system. + +Background +---------- + +Tendermint has a very thin common wrapper that makes Tendermint itself +(largely) agnostic to the data storage layer (within the realm of the popular +key-value/embedded databases.) This flexibility is not particularly useful: +the benefits of a specific database engine in the context of Tendermint are not +particularly well understood, and the maintenance burden for multiple backends +is not commensurate with the benefit provided. Additionally, because the data +storage layer is handled generically, and most tests run with an in-memory +framework, it's difficult to take advantage of any higher-level features of a +database engine. + +Ideally, developers within Tendermint will be able to interact with persisted +data via an interface that can function approximately like an object +store, and this storage interface will be able to accommodate all existing +persistence workloads (e.g. block storage, local peer management information +like the "address book", crash-recovery log like the WAL.) In addition to +providing a more ergonomic interface and new semantics, by selecting a single +storage engine, tendermint can use native durability and atomicity features of +the storage engine and simplify its own implementations. + +Data Access Patterns +~~~~~~~~~~~~~~~~~~~~ + +Tendermint's data access patterns have the following characteristics: + +- aggregate data size often exceeds memory. + +- most data (e.g. blocks) is rarely mutated after it's written, but small + amounts of working data are persisted by nodes and frequently + mutated (e.g. peer information, validator information.) + +- read patterns can be quite random. + +- crash resistance and crash recovery, provided by write-ahead-logs (in + consensus, and potentially for the mempool) should allow the system to + resume work after an unexpected shut down. + +Project Goals +~~~~~~~~~~~~~ + +As we think about replacing the current persistence layer, we should consider +the following high level goals: + +- drop dependencies on storage engines that have a CGo dependency. + +- encapsulate data format and data storage from higher-level services + (e.g.
reactors) within tendermint. + +- select a storage engine that does not incur any additional operational + complexity (e.g. database should be embedded.) + +- provide database semantics with sufficient ACID, snapshot, and + transactional support. + +Open Questions +~~~~~~~~~~~~~~ + +The following questions remain: + +- what kind of data-access concurrency does tendermint require? + +- would tendermint users (SDK, etc.) benefit from some shared database + infrastructure? + + - In earlier conversations it seemed as if the SDK has selected Badger and + RocksDB for their storage engines, and it might make sense to be able to + (optionally) pass a handle to a Badger instance between the libraries in + some cases. + +- what are typical data sizes, and what kinds of memory sizes can we expect + operators to be able to provide? + +- in addition to simple persistence, what kind of additional semantics would + tendermint like to enjoy (e.g. transactional semantics, unique constraints, + indexes, in-place-updates, etc.)? + +Decision Framework +~~~~~~~~~~~~~~~~~~ + +Given the constraint of removing the CGo dependency, the decision is between +"badger" and "boltdb" (in the form of the etcd/CoreOS fork) as the low-level +storage engine. On +top of this and somewhat orthogonally, we must also decide on the interface to +the database and how the larger application will have to interact with the +database layer. Users of the data layer shouldn't ever need to interact with +raw byte slices from the database, and should mostly have the experience of +interacting with Go-types. + +Badger is more consistently developed and has a broader feature set than +Bolt. At the same time, Badger is likely more memory intensive and may have +more overhead in terms of open file handles given its model. At first glance, +Badger is the obvious choice: it's actively developed and it has a lot of +features that could be useful. Bolt is not without some benefits: it's stable +and is maintained by the etcd folks; its simpler model (single memory-mapped +file, etc.) may be easier to reason about. + +I propose that we consider the following specific questions about storage +engines: + +- does Badger's evolving development, which may result in data file format + changes in the future, and could restrict our access to using the latest + version of the library between major upgrades, present a problem? + +- do we have goals/concerns about memory footprint that Badger may + prevent us from hitting, particularly as data sets grow over time? + +- what kind of additional tooling might we need/like to build (dump/restore, + etc.)? + +- do we want to run unit/integration tests against data files on disk rather + than relying exclusively on the memory database? + +Project Scope +~~~~~~~~~~~~~ + +This project will consist of the following aspects: + +- selecting a storage engine, and modifying the tendermint codebase to + disallow any configuration of the storage engine outside of tendermint. + +- remove the dependency on the current tm-db interfaces and replace with some + internalized, safe, and ergonomic interface for data persistence with all + required database semantics (a strawman sketch of such an interface appears + below.) + +- update core tendermint code to use the new interface and data tools. + +Next Steps +~~~~~~~~~~ + +- circulate the RFC, and discuss options with appropriate stakeholders. + +- write a brief ADR to summarize decisions around technical decisions reached + during the RFC phase.
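+
+As a concrete strawman for the "internalized, safe, and ergonomic interface"
+mentioned under Project Scope, the sketch below shows one shape such a layer
+could take. Every name in it is invented for this discussion; nothing here
+exists in tendermint or tm-db today.
+
+.. code-block:: go
+
+    // Package store is a strawman sketch only; none of these names exist
+    // in tendermint today.
+    package store
+
+    import "context"
+
+    // Record is any type that knows how to marshal itself; implementations
+    // would likely be protobuf-backed, so callers never touch raw byte slices.
+    type Record interface {
+        Key() []byte
+        Marshal() ([]byte, error)
+        Unmarshal(data []byte) error
+    }
+
+    // Collection is a typed view over one logical keyspace
+    // (blocks, peer addresses, the consensus WAL, and so on).
+    type Collection interface {
+        Get(ctx context.Context, key []byte, out Record) error
+        Put(ctx context.Context, rec Record) error
+        Delete(ctx context.Context, key []byte) error
+
+        // Iterate visits records in key order; snapshot isolation from the
+        // underlying engine would make this consistent without caller locking.
+        Iterate(ctx context.Context, prefix []byte, fn func(Record) error) error
+    }
+
+    // Store exposes collections and transactional write batches, so the
+    // engine's native atomicity and durability are used rather than ad hoc
+    // batching in each reactor.
+    type Store interface {
+        Collection(name string) Collection
+        WriteTx(ctx context.Context, fn func(Collection) error) error
+        Close() error
+    }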
+ +References +---------- + +- `boltdb `_ +- `badger `_ +- `badgerdb overview `_ +- `boltdb overview `_ +- `boltdb vs badger `_ +- `bolthold `_ +- `badgerhold `_ +- `Pebble `_ +- `SDK Issue Regarding IVAL `_ +- `SDK Discussion about SMT/IVAL `_ + +Discussion +---------- + +- All things being equal, my tendency would be to use badger, with badgerhold + (if that makes sense) for its ergonomics and indexing capabilities, which + will require some small selection of wrappers for better write transaction + support. This is a weakly held tendency/belief and I think it would be + useful for the RFC process to build consensus (or not) around this basic + assumption. diff --git a/docs/rfc/rfc-002-ipc-ecosystem.md b/docs/rfc/rfc-002-ipc-ecosystem.md new file mode 100644 index 000000000..9b51beb7f --- /dev/null +++ b/docs/rfc/rfc-002-ipc-ecosystem.md @@ -0,0 +1,420 @@ +# RFC 002: Interprocess Communication (IPC) in Tendermint + +## Changelog + +- 08-Sep-2021: Initial draft (@creachadair). + + +## Abstract + +Communication in Tendermint among consensus nodes, applications, and operator +tools uses a variety of different message formats and transport mechanisms. In some +cases there are multiple options. Having all these options complicates both the +code and the developer experience, and hides bugs. To support a more robust, +trustworthy, and usable system, we should document which communication paths +are essential, which could be removed or reduced in scope, and what we can +improve for the most important use cases. + +This document proposes a variety of possible improvements of varying size and +scope. Specific design proposals should get their own documentation. + + +## Background + +The Tendermint state replication engine has a complex IPC footprint. + +1. Consensus nodes communicate with each other using a networked peer-to-peer + message-passing protocol. + +2. Consensus nodes communicate with the application whose state is being + replicated via the [Application BlockChain Interface (ABCI)][abci]. + +3. Consensus nodes export a network-accessible [RPC service][rpc-service] to + support operations (bootstrapping, debugging) and synchronization of [light clients][light-client]. + This interface is also used by the [`tendermint` CLI][tm-cli]. + +4. Consensus nodes export a gRPC service exposing a subset of the methods of + the RPC service described by (3). This was intended to simplify the + implementation of tools that already use gRPC to communicate with an + application (via the Cosmos SDK), and want to also talk to the consensus + node without implementing yet another RPC protocol. + + The gRPC interface to the consensus node has been deprecated and is slated + for removal in the forthcoming Tendermint v0.36 release. + +5. Consensus nodes may optionally communicate with a "remote signer" that holds + a validator key and can provide public keys and signatures to the consensus + node. One of the stated goals of this configuration is to allow the signer + to be run on a private network, separate from the consensus node, so that a + compromise of the consensus node from the public network would be less + likely to expose validator keys. + +## Discussion: Transport Mechanisms + +### Remote Signer Transport + +A remote signer communicates with the consensus node in one of two ways: + +1. "Raw": Using a TCP or Unix-domain socket which carries varint-prefixed + protocol buffer messages. In this mode, the consensus node is the server, + and the remote signer is the client.
+ + This mode has been deprecated, and is intended to be removed. + +2. gRPC: This mode uses the same protobuf messages as the "Raw" mode, but uses a + standard encrypted gRPC HTTP/2 stub as the transport. In this mode, the + remote signer is the server and the consensus node is the client. + + +### ABCI Transport + +In ABCI, the _application_ is the server, and the Tendermint consensus engine +is the client. Most applications implement the server using the [Cosmos SDK][cosmos-sdk], +which handles low-level details of the ABCI interaction and provides a +higher-level interface to the rest of the application. The SDK is written in Go. + +Beneath the SDK, the application communicates with Tendermint core in one of +two ways: + +- In-process direct calls (for applications written in Go and compiled against + the Tendermint code). This is an optimization for the common case where an + application is written in Go, to save on the overhead of marshaling and + unmarshaling requests and responses within the same process: + [`abci/client/local_client.go`][local-client] + +- A custom remote procedure protocol built on wire-format protobuf messages + using a socket (the "socket protocol"): [`abci/server/socket_server.go`][socket-server] + (a minimal framing sketch appears at the end of this section) + +The SDK also provides a [gRPC service][sdk-grpc] accessible from outside the +application, allowing clients to broadcast transactions to the network, look up +transactions, and simulate transaction costs. + + +### RPC Transport + +The consensus node RPC service allows callers to query consensus parameters +(genesis data, transactions, commits), node status (network info, health +checks), application state (abci_query, abci_info), mempool state, and other +attributes of the node and its application. The service also provides methods +allowing transactions and evidence to be injected ("broadcast") into the +blockchain. + +The RPC service is exposed in several ways: + +- HTTP GET: Queries may be sent as URI parameters, with method names in the path. + +- HTTP POST: Queries may be sent as JSON-RPC request messages in the body of an + HTTP POST request. The server uses a custom implementation of JSON-RPC that + is not fully compatible with the [JSON-RPC 2.0 spec][json-rpc], but handles + the common cases. + +- Websocket: Queries may be sent as JSON-RPC request messages via a websocket. + This transport uses more or less the same JSON-RPC plumbing as the HTTP POST + handler. + + The websocket endpoint also includes three methods that are _only_ exported + via websocket, which appear to support event subscription. + +- gRPC: A subset of queries may be issued in protocol buffer format to the gRPC + interface described above under (4). As noted, this endpoint is deprecated + and will be removed in v0.36. + +### Opportunities for Simplification + +**Claim:** There are too many IPC mechanisms. + +The preponderance of ABCI usage is via the Cosmos SDK, which means the +application and the consensus node are compiled together into a single binary, +and the consensus node calls the ABCI methods of the application directly as Go +functions. + +We also need a true IPC transport to support ABCI applications _not_ written in +Go. There are also several known applications written in Rust, for example +(including [Anoma](https://github.com/anoma/anoma), Penumbra, +[Oasis](https://github.com/oasisprotocol/oasis-core), Twilight, and +[Nomic](https://github.com/nomic-io/nomic)). Ideally we will have at most one +such transport "built-in": More esoteric cases can be handled by a custom proxy.
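+
+For reference, both the deprecated "raw" remote-signer mode and the ABCI
+"socket protocol" above use the same basic idea: varint-length-prefixed
+protobuf messages over a stream. A minimal sketch of that framing follows
+(placeholder payload bytes stand in for real marshaled protobuf messages,
+and the exact varint flavor here is illustrative, not the wire spec):
+
+```go
+package main
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"io"
+)
+
+// writeMsg frames one message: a varint length prefix followed by the
+// payload bytes.
+func writeMsg(w io.Writer, payload []byte) error {
+	var lenBuf [binary.MaxVarintLen64]byte
+	n := binary.PutUvarint(lenBuf[:], uint64(len(payload)))
+	if _, err := w.Write(lenBuf[:n]); err != nil {
+		return err
+	}
+	_, err := w.Write(payload)
+	return err
+}
+
+// readMsg reads one length-prefixed message back off the stream.
+func readMsg(r *bufio.Reader) ([]byte, error) {
+	length, err := binary.ReadUvarint(r)
+	if err != nil {
+		return nil, err
+	}
+	payload := make([]byte, length)
+	if _, err := io.ReadFull(r, payload); err != nil {
+		return nil, err
+	}
+	return payload, nil
+}
+
+func main() {
+	var buf bytes.Buffer
+	if err := writeMsg(&buf, []byte("request bytes")); err != nil {
+		panic(err)
+	}
+	msg, err := readMsg(bufio.NewReader(&buf))
+	if err != nil {
+		panic(err)
+	}
+	fmt.Printf("round-tripped %d bytes: %q\n", len(msg), msg)
+}
+```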
+Pragmatically, gRPC is probably the right choice here. + +The primary consumers of the multi-headed "RPC service" today are the light +client and the `tendermint` command-line client. There is probably some local +use via curl, but I expect that is mostly ad hoc. Ethan reports that nodes are +often configured with the ports to the RPC service blocked, which is good for +security but complicates use by the light client. + +### Context: Remote Signer Issues + +Since the remote signer needs a secure communication channel to exchange keys +and signatures, and is expected to run truly remotely from the node (i.e., on a +separate physical server), there is not a whole lot we can do here. We should +finish the deprecation and removal of the "raw" socket protocol between the +consensus node and remote signers, but the use of gRPC is appropriate. + +The main improvement we can make is to simplify the implementation quite a bit, +once we no longer need to support both "raw" and gRPC transports. + +### Context: ABCI Issues + +In the original design of ABCI, the presumption was that all access to the +application should be mediated by the consensus node. The idea is that outside +access could change application state and corrupt the consensus process, which +relies on the application to be deterministic. Of course, even without outside +access an application could behave nondeterministically, but allowing other +programs to send it requests was seen as courting trouble. + +Conversely, users noted that most of the time, tools written for a particular +application don't want to talk to the consensus module directly. The +application "owns" the state machine the consensus engine is replicating, so +tools that care about application state should talk to the application. +Otherwise, they would have to bake in knowledge about Tendermint (e.g., its +interfaces and data structures) just because of the mediation. + +For clients to talk directly to the application, however, there is another +concern: The consensus node is the ABCI _client_, so it is inconvenient for the +application to "push" work into the consensus module via ABCI itself. The +current implementation works around this by calling the consensus node's RPC +service, which exposes an `ABCIQuery` kitchen-sink method that gives the +application a way to poke ABCI messages in the other direction. + +Without this RPC method, you could work around this (at least in principle) by +having the consensus module "poll" the application for work that needs to be done, +but that has unsatisfactory implications for performance and robustness, as +well as being harder to understand. + +There has apparently been discussion about making communication between the +consensus node and the application more bidirectional, but this issue +seems to still be unresolved. + +Another complication of ABCI is that it requires the application (server) to +maintain [four separate connections][abci-conn]: One for "consensus" operations +(BeginBlock, EndBlock, DeliverTx, Commit), one for "mempool" operations, one +for "query" operations, and one for "snapshot" (state synchronization) operations. +The rationale seems to have been that these groups of operations should be able +to proceed concurrently with each other. In practice, it results in a very complex +state management problem to coordinate state updates between the separate streams.
+While application authors in Go are mostly insulated from that complexity by the +Cosmos SDK, the plumbing to maintain those separate streams is complicated, hard +to understand, and we suspect it contains concurrency bugs and/or lock contention +issues affecting performance that are subtle and difficult to pin down. + +Even without changing the semantics of any ABCI operations, this code could be +made smaller and easier to debug by separating the management of concurrency +and locking from the IPC transport: If all requests and responses are routed +through one connection, the server can explicitly maintain priority queues for +requests and responses, and make less-conservative decisions about when locks +are (or aren't) required to synchronize state access. With independent queues, +the server must lock conservatively, and no optimistic scheduling is practical. + +This would be a tedious implementation change, but should be achievable without +breaking any of the existing interfaces. More importantly, it could potentially +address a lot of difficult concurrency and performance problems we currently +see anecdotally but have difficulty isolating because of how intertwined these +separate message streams are at runtime. + +TODO: Impact of ABCI++ for this topic? + +### Context: RPC Issues + +The RPC system serves several masters, and has a complex surface area. I +believe there are some improvements that can be exposed by separating some of +these concerns. + +The Tendermint light client currently uses the RPC service to look up blocks +and transactions, and to forward ABCI queries to the application. The light +client proxy uses the RPC service via a websocket. The Cosmos IBC relayer also +uses the RPC service via websocket to watch for transaction events, and uses +the `ABCIQuery` method to fetch information and proofs for posted transactions. + +Some work is already underway toward using P2P message passing rather than RPC +to synchronize light client state with the rest of the network. IBC relaying, +however, requires access to the event system, which is currently not accessible +except via the RPC interface. Event subscription _could_ be exposed via P2P, +but that is a larger project since it adds P2P communication load, and might +thus have an impact on the performance of consensus. + +If event subscription can be moved into the P2P network, we could entirely +remove the websocket transport, even for clients that still need access to the +RPC service. Until then, we may still be able to reduce the scope of the +websocket endpoint to _only_ event subscription, by moving uses of the RPC +server as a proxy to ABCI over to the gRPC interface. + +Having the RPC server still makes sense for local bootstrapping and operations, +but it can be further simplified. Here are some specific proposals: + +- Remove the HTTP GET interface entirely. + +- Simplify JSON-RPC plumbing to remove unnecessary reflection and wrapping. + +- Remove the gRPC interface (this is already planned for v0.36). + +- Separate the websocket interface from the rest of the RPC service, and + restrict it to only event subscription. + + Eventually we should try to remove the websocket interface entirely, but we + will need to revisit that (probably in a new RFC) once we've done some of the + easier things. + +These changes would preserve the ability of operators to issue queries with +curl (but would require using JSON-RPC instead of URI parameters).
That would +be a little less user-friendly, but the affected use case should not be +especially prevalent. + +These changes would also preserve compatibility with existing JSON-RPC based +code paths like the `tendermint` CLI and the light client (even ahead of +further work to remove that dependency). + +**Design goal:** An operator should be able to disable non-local access to the +RPC server on any node in the network without impairing the network's ability +to provide state replication, including for light clients. + +**Design principle:** All communication required to implement and monitor the +consensus network should use P2P, including the various synchronizations. + +### Options for ABCI Transport + +The majority of current usage is in Go, and the majority of that is mediated by +the Cosmos SDK, which uses the "direct call" interface. There is probably some +opportunity to clean up the implementation of that code, notably by inverting +which interface is at the "top" of the abstraction stack (currently it acts +like an RPC interface, and escape-hatches into the direct call). However, this +general approach works fine and doesn't need to be fundamentally changed. + +For applications _not_ written in Go, the two remaining options are the +"socket" protocol (another variation on varint-prefixed protobuf messages over +an unstructured stream) and gRPC. It would be nice if we could get rid of one +of these to reduce (unneeded?) optionality. + +Since both the socket protocol and gRPC depend on protocol buffers, the +"socket" protocol is the most obvious choice to remove. While gRPC is more +complex, the set of languages that _have_ protobuf support but _lack_ gRPC +support is small. Moreover, gRPC is already widely used in the rest of the +ecosystem (including the Cosmos SDK). + +If some use case did arise later that can't work with gRPC, it would not be too +difficult for that application author to write a little proxy (in Go) that +bridges the convenient SDK APIs into a simpler protocol than gRPC. + +**Design principle:** It is better for an uncommon special case to carry the +burdens of its specialness, than to bake an escape hatch into the infrastructure. + +**Recommendation:** We should deprecate and remove the socket protocol. + +### Options for RPC Transport + +[ADR 057][adr-57] proposes using gRPC for the Tendermint RPC implementation. +This is still possible, but if we are able to simplify and decouple the +concerns as described above, I do not think it should be necessary. + +While JSON-RPC is not the best possible RPC protocol for all situations, it has +some advantages over gRPC for our domain. Specifically: + +- It is easy to call JSON-RPC manually from the command-line, which helps with + a common use of the RPC service: local debugging and operations. + + Relatedly: JSON is relatively easy for humans to read and write, and it can + be easily copied and pasted to share sample queries and debugging results in + chat, issue comments, and so on. Ideally, the RPC service will not be used + for activities where the costs of a text protocol are important compared to + its legibility and manual usability benefits. + +- gRPC has an enormous dependency footprint for both clients and servers, and + many of the features it provides to support security and performance + (encryption, compression, streaming, etc.) are mostly irrelevant to local + use.
Tendermint already needs to include a gRPC client for the remote signer, + but if we can avoid the need for a _client_ to depend on gRPC, that is a win + for usability. + +- If we intend to migrate light clients off RPC to use P2P entirely, there is + no advantage to forcing a temporary migration to gRPC along the way; and once + the light client is not dependent on the RPC service, the efficiency of the + protocol is much less important. + +- We can still get the benefits of generated data types using protocol buffers, even + without using gRPC: + + - Protobuf defines a standard JSON encoding for all message types so + languages with protobuf support do not need to worry about type mapping + oddities. + + - Using JSON means that even languages _without_ good protobuf support can + implement the protocol with a bit more work, and I expect this situation to + be rare. + +Even if a language lacks a good standard JSON-RPC mechanism, the protocol is +lightweight and can be implemented by simple send/receive over TCP or +Unix-domain sockets with no need for code generation, encryption, etc. gRPC +uses a complex HTTP/2 based transport that is not easily replicated. + +### Future Work + +The background and proposals sketched above focus on the existing structure of +Tendermint and improvements we can make in the short term. It is worthwhile to +also consider options for longer-term broader changes to the IPC ecosystem. +The following outlines some ideas at a high level: + +- **Consensus service:** Today, the application and the consensus node are + nominally connected only via ABCI. Tendermint was originally designed with + the assumption that all communication with the application should be mediated + by the consensus node. Based on further experience, however, the design goal + is now that the _application_ should be the mediator of application state. + + As noted above, however, ABCI is a client/server protocol, with the + application as the server. For outside clients that turns out to have been a + good choice, but it complicates the relationship between the application and + the consensus node: Previously, transactions were entered via the node; now + they are entered via the app. + + We have worked around this by using the Tendermint RPC service to give the + application a "back channel" to the consensus node, so that it can push + transactions back into the consensus network. But the RPC service exposes a + lot of other functionality, too, including event subscription, block and + transaction queries, and a lot of node status information. + + Even if we can't easily "fix" the orientation of the ABCI relationship, we + could improve isolation by splitting out the parts of the RPC service that + the application needs as a back-channel, and sharing those _only_ with the + application. By defining a "consensus service", we could give the application + a way to talk back that is limited to only the capabilities it needs. This approach + has the benefit that we could do it without breaking existing use, and if we + later did "fix" the ABCI directionality, we could drop the special case + without disrupting the rest of the RPC interface. + +- **Event service:** Right now, the IBC relayer relies on the Tendermint RPC + service to provide a stream of block and transaction events, which it uses to + discover which transactions need relaying to other chains.
While I think + that event subscription should eventually be handled via P2P, we could gain + some immediate benefit by splitting out event subscription from the rest of + the RPC service. + + In this model, an event subscription service would be exposed on the public + network, but on a different endpoint. This would remove the need for the RPC + service to support the websocket protocol, and would allow operators to + isolate potentially sensitive status query results from the public network. + + At the moment the relayers also use the RPC service to get block data for + synchronization, but work is already in progress to handle that concern via + the P2P layer. Once that's done, event subscription could be separated. + +Separating parts of the existing RPC service is not without cost: It might +require additional connection endpoints, for example, though it is also not too +difficult for multiple otherwise-independent services to share a connection. + +In return, though, it would become easier to reduce transport options and for +operators to independently control access to sensitive data. Considering the +viability and implications of these ideas is beyond the scope of this RFC, but +they are documented here since they follow from the background we have already +discussed. + +## References + +[abci]: https://github.com/tendermint/spec/tree/95cf253b6df623066ff7cd4074a94e7a3f147c7a/spec/abci +[rpc-service]: https://docs.tendermint.com/master/rpc/ +[light-client]: https://docs.tendermint.com/master/tendermint-core/light-client.html +[tm-cli]: https://github.com/tendermint/tendermint/tree/master/cmd/tendermint +[cosmos-sdk]: https://github.com/cosmos/cosmos-sdk/ +[local-client]: https://github.com/tendermint/tendermint/blob/master/abci/client/local_client.go +[socket-server]: https://github.com/tendermint/tendermint/blob/master/abci/server/socket_server.go +[sdk-grpc]: https://pkg.go.dev/github.com/cosmos/cosmos-sdk/types/tx#ServiceServer +[json-rpc]: https://www.jsonrpc.org/specification +[abci-conn]: https://github.com/tendermint/spec/blob/master/spec/abci/apps.md#state +[adr-57]: https://github.com/tendermint/tendermint/blob/master/docs/architecture/adr-057-RPC.md diff --git a/docs/rfc/rfc-004-e2e-framework.rst b/docs/rfc/rfc-004-e2e-framework.rst new file mode 100644 index 000000000..8508ca173 --- /dev/null +++ b/docs/rfc/rfc-004-e2e-framework.rst @@ -0,0 +1,213 @@ +======================================== +RFC 004: E2E Test Framework Enhancements +======================================== + +Changelog +--------- + +- 2021-09-14: Started initial draft (@tychoish) + +Abstract +-------- + +This document discusses a series of improvements to the e2e test framework +that we can consider during the next few releases to help boost confidence in +Tendermint releases and improve developer efficiency. + +Background +---------- + +During the 0.35 release cycle, the E2E tests were a source of great +value, helping to identify a number of bugs before release. At the same time, +the tests were not consistently passing, which reduced +their value and forced the core development team to allocate time and energy +to maintaining the e2e tests and chasing down issues with the test +harness. The experience of this release cycle suggests a series of +improvements to the test framework; this document attempts to capture +them, along with their motivations and potential impact.
+ +Projects +-------- + +Flexible Workload Generation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Presently the e2e suite contains a single workload generation pattern, which +exists simply to ensure that the test networks have some work during their +runs. However, the shape and volume of the work are very consistent and +deliberately gentle, to help ensure test reliability. + +We don't need a complex workload generation framework, but being able to have +a few different workload shapes available for test networks, both generated and +hand-crafted, would be useful. + +Workload patterns/configurations might include: + +- transaction targeting patterns (include light nodes, round robin, target + individual nodes) + +- variable transaction size over time. + +- transaction broadcast option (synchronous, checked, fire-and-forget, + mixed). + +- number of transactions to submit. + +- non-transaction workloads (evidence submission, query, event subscription.) + +Configurable Generator +~~~~~~~~~~~~~~~~~~~~~~ + +The nightly e2e suite is defined by the `testnet generator +`_, +and it's difficult to add dimensions or change the focus of the test suite in +any way without modifying the implementation of the generator. If the +generator were more configurable, potentially via a file rather than in +the Go implementation, we could modify the focus of the test suite on the +fly (a hypothetical sketch of such a configuration appears at the end of +this section.) + +Features that we might want to configure: + +- number of test networks to generate of various topologies, to improve + coverage of different configurations. + +- test application configurations (to modify the latency of ABCI calls, etc.) + +- size of test networks. + +- workload shape and behavior. + +- initial sync and catch-up configurations. + +The testnet generator currently provides runtime options for limiting the +generator to specific types of P2P stacks, and for generating multiple groups +of test cases to support parallelism. The goal is to extend this pattern and +avoid hardcoding the matrix of test cases in the generator code. Once the +testnet configuration generation behavior is configurable at runtime, +developers could use the e2e framework to validate changes locally, rather +than landing changes that break the nightly e2e tests a day later. + +In addition to the autogenerated suite, it might make sense to maintain a +small collection of hand-crafted cases that exercise configurations of +concern, to run as part of the nightly (or less frequent) loop. + +Implementation Plan Structure +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As a development team, we should determine which features should impact the e2e +tests early in the development cycle, and if we intend to modify the e2e +tests to exercise a feature, we should identify this early and begin the +integration process as early as possible. + +To facilitate this, we should adopt a practice whereby we exercise specific +features that are currently under development more rigorously in the e2e +suite, and then as development stabilizes we can reduce the number or weight +of these features in the suite. + +As of 0.35 there are essentially two end-to-end tests: the suite of 64 +generated test networks, and the hand-crafted `ci.toml` test case. The +generated test cases help provide systematic coverage, while the `ci` run +provides coverage for a large number of features.
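+
+Returning to the "Configurable Generator" idea above, the sketch below shows
+one hypothetical shape a runtime manifest could take. Every field name here is
+invented for illustration; nothing like this exists in the current generator,
+and only the ``toml`` dependency (already in go.mod) is real.
+
+.. code-block:: go
+
+    package main
+
+    import (
+        "fmt"
+
+        "github.com/BurntSushi/toml" // already a tendermint dependency
+    )
+
+    // GeneratorConfig is hypothetical: none of these fields exist in the
+    // current generator; they only illustrate the shape such a file could take.
+    type GeneratorConfig struct {
+        Groups        int      `toml:"groups"`        // parallel test groups to emit
+        Topologies    []string `toml:"topologies"`    // e.g. "single", "quad", "large"
+        P2P           []string `toml:"p2p"`           // which p2p stacks to exercise
+        MaxNodes      int      `toml:"max_nodes"`     // upper bound on testnet size
+        ABCILatency   string   `toml:"abci_latency"`  // simulated app latency, e.g. "100ms"
+        Perturbations bool     `toml:"perturbations"` // include kill/restart/disconnect cases
+    }
+
+    func main() {
+        manifest := `
+    groups = 4
+    topologies = ["single", "quad"]
+    p2p = ["new", "legacy"]
+    max_nodes = 8
+    abci_latency = "100ms"
+    perturbations = true
+    `
+        var cfg GeneratorConfig
+        if _, err := toml.Decode(manifest, &cfg); err != nil {
+            panic(err)
+        }
+        fmt.Printf("generating %d groups over %v\n", cfg.Groups, cfg.Topologies)
+    }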
+ +Reduce Cycle Time +~~~~~~~~~~~~~~~~~ + +One of the barriers to leveraging the e2e framework, and one of the challenges +in debugging failures, is that the cycle time for running a single test iteration is +quite high: 5 minutes to build the docker image, plus the time to run the test +or tests. + +There are a number of improvements and enhancements that can reduce the cycle +time in practice: + +- reduce the amount of time required to build the docker image used in these + tests. Without the dependency on CGo, the tendermint binaries could be + (cross) compiled outside of the docker container and then injected into + them, which would take better advantage of docker's native caching; indeed, + without the dependency on CGo there would be no hard requirement + for the e2e tests to use docker at all. + +- support test parallelism. Because of the way the testnets are orchestrated, + a single system can really only run one network at a time. For executions + (local or remote) with more resources, there's no reason not to run a few + networks in parallel to reduce the feedback time. + +- prune testnet configurations that are unlikely to provide good signal, to + shorten the time to feedback. + +- apply some kind of tiered approach to test execution, to improve the + legibility of the test result. For example, order tests by the dependencies of + their features, or run a test network without perturbations before running + the same configuration with perturbations, to isolate the impact of + specific features. + +- orchestrate the test harness directly from go test rather than via a special + harness and shell scripts, so e2e tests fit more naturally into developers' + existing workflows. + +Many of these improvements, particularly reducing the build time, will also +reduce the time to get feedback during automated builds. + +Deeper Insights +~~~~~~~~~~~~~~~ + +When a test network fails, it's incredibly difficult to understand _why_ the +network failed, as the current system provides very little insight into the +system outside of the process logs. When a test network stalls or fails, +developers should be able to quickly and easily get a sense of the state of +the network and all nodes. + +Improvements in pursuit of this goal include functionality that would help +node operators in production environments by improving the quality and utility +of the logging messages and other reported metrics, but also provide some +tools to collect and aggregate this data for developers in the context of test +networks. + +- Interleave messages from all nodes in the network to be able to correlate + events during the test run. + +- Collect structured metrics of the system operation (CPU/MEM/IO) during the + test run, as well as from each tendermint/application process. + +- Build (simple) tools to be able to render and summarize the data collected + during the test run to answer basic questions about test outcome. + +Flexible Assertions +~~~~~~~~~~~~~~~~~~~ + +Currently, all assertions run for every test network, which keeps the +assertions pretty bland and makes the framework primarily useful for smoke +testing. It would be useful to be able to write and run different +assertions for different configurations, which would allow us to test outside +of the happy path. + +In general, our existing assertions occupy a small fraction of the total test +time, so adding a few extra assertions would come at limited cost and could +help build confidence.
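+
+As a sketch of what such per-configuration assertions could look like, the
+snippet below registers checks alongside a predicate describing which
+configurations they are valid for. The ``Testnet`` fields and the registry
+are invented for illustration; they are not the current e2e API.
+
+.. code-block:: go
+
+    package main
+
+    import "fmt"
+
+    // Testnet is a stand-in for the e2e manifest type; only the fields
+    // needed for this illustration are included.
+    type Testnet struct {
+        Name      string
+        StateSync bool
+    }
+
+    // An assertion runs against a finished testnet and reports a failure.
+    type assertion struct {
+        name    string
+        applies func(Testnet) bool // which configurations this check is valid for
+        check   func(Testnet) error
+    }
+
+    var registry = []assertion{
+        {
+            name:    "blocks-produced",
+            applies: func(Testnet) bool { return true }, // always valid
+            check:   func(tn Testnet) error { return nil /* inspect heights here */ },
+        },
+        {
+            name:    "statesync-completed",
+            applies: func(tn Testnet) bool { return tn.StateSync },
+            check:   func(tn Testnet) error { return nil /* inspect snapshots here */ },
+        },
+    }
+
+    func runAssertions(tn Testnet) {
+        for _, a := range registry {
+            if !a.applies(tn) {
+                continue // skip checks that don't apply to this configuration
+            }
+            if err := a.check(tn); err != nil {
+                fmt.Printf("testnet %s failed %s: %v\n", tn.Name, a.name, err)
+            }
+        }
+    }
+
+    func main() {
+        runAssertions(Testnet{Name: "ci", StateSync: true})
+    }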
+ +Additional Kinds of Testing +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The existing e2e suite exercises networks of nodes that have a homogeneous +tendermint version and a stable configuration, and that are expected to make +progress. There are many other possible test configurations that may be +interesting to engage with. These could include dimensions such as: + +- Multi-version testing to exercise our compatibility guarantees for networks + that might have different tendermint versions. + +- As a flavor of multi-version testing, include upgrade testing, to build + confidence in migration code and procedures. + +- Additional test applications, particularly practical applications, + including some that use gaiad and/or the cosmos-sdk. Test-only applications + that simulate other kinds of applications (e.g. variable application + operation latency.) + +- Tests of "non-viable" configurations that ensure that forbidden combinations + lead to halts. + +References +---------- + +- `ADR 66: End-to-End Testing <../architecture/adr-66-e2e-testing.md>`_ diff --git a/docs/tendermint-core/block-sync.md b/docs/tendermint-core/block-sync.md index 9d362424f..43e849fcc 100644 --- a/docs/tendermint-core/block-sync.md +++ b/docs/tendermint-core/block-sync.md @@ -17,9 +17,9 @@ consensus gossip protocol. ## Using Block Sync -To support faster syncing, Tendermint offers a `fast-sync` mode, which +To support faster syncing, Tendermint offers a `blocksync` mode, which is enabled by default, and can be toggled in the `config.toml` or via -`--fast_sync=false`. +`--blocksync.enable=false`. In this mode, the Tendermint daemon will sync hundreds of times faster than if it used the real-time consensus process. Once caught up, the @@ -29,18 +29,23 @@ has at least one peer and it's height is at least as high as the max reported peer height. See [the IsCaughtUp method](https://github.com/tendermint/tendermint/blob/b467515719e686e4678e6da4e102f32a491b85a0/blockchain/pool.go#L128). -Note: There are two versions of Block Sync. We recommend using v0 as v2 is still in beta. +Note: There are multiple versions of Block Sync. Please use v0 as the other versions are no longer supported. If you would like to use a different version you can do so by changing the version in the `config.toml`: ```toml ####################################################### ### Block Sync Configuration Connections ### ####################################################### -[fastsync] +[blocksync] + +# If this node is many blocks behind the tip of the chain, BlockSync +# allows it to catch up quickly by downloading blocks in parallel +# and verifying their commits +enable = true # Block Sync version to use: -# 1) "v0" (default) - the legacy Block Sync implementation -# 2) "v2" - complete redesign of v0, optimized for testability & readability +# 1) "v0" (default) - the standard Block Sync implementation +# 2) "v2" - DEPRECATED, please use v0 version = "v0" ``` @@ -55,4 +60,4 @@ the network best height, it will switches to the state sync mechanism and then e another event for exposing the fast-sync `complete` status and the state `height`. The user can query the events by subscribing `EventQueryBlockSyncStatus` -Please check [types](https://pkg.go.dev/github.com/tendermint/tendermint/types?utm_source=godoc#pkg-constants) for the details. \ No newline at end of file +Please check [types](https://pkg.go.dev/github.com/tendermint/tendermint/types?utm_source=godoc#pkg-constants) for the details.
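For example, a client might watch these status events like this. This is a minimal sketch only: the `rpchttp.New` constructor signature and the event-channel plumbing are assumed from the v0.35 client packages, and the node address is illustrative.

```go
package main

import (
	"context"
	"fmt"
	"time"

	rpchttp "github.com/tendermint/tendermint/rpc/client/http"
	"github.com/tendermint/tendermint/types"
)

func main() {
	// NOTE: constructor signature assumed for this release's client package.
	client, err := rpchttp.New("tcp://127.0.0.1:26657")
	if err != nil {
		panic(err)
	}
	if err := client.Start(); err != nil { // starts the websocket machinery
		panic(err)
	}
	defer client.Stop() //nolint:errcheck

	// Subscribe using the query constant referenced above.
	ctx := context.Background()
	events, err := client.Subscribe(ctx, "watcher", types.EventQueryBlockSyncStatus.String())
	if err != nil {
		panic(err)
	}
	for {
		select {
		case ev := <-events:
			fmt.Printf("block sync status event: %+v\n", ev.Data)
		case <-time.After(time.Minute):
			return
		}
	}
}
```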
diff --git a/docs/tendermint-core/using-tendermint.md b/docs/tendermint-core/using-tendermint.md index bbcdd0370..4b8a26bad 100644 --- a/docs/tendermint-core/using-tendermint.md +++ b/docs/tendermint-core/using-tendermint.md @@ -185,51 +185,65 @@ the argument name and use `_` as a placeholder. ### Formatting -The following nuances when sending/formatting transactions should be -taken into account: +When sending transactions to the RPC interface, the following formatting rules +must be followed: -With `GET`: +Using `GET` (with parameters in the URL): -To send a UTF8 string byte array, quote the value of the tx parameter: +To send a UTF8 string as transaction data, enclose the value of the `tx` +parameter in double quotes: ```sh curl 'http://localhost:26657/broadcast_tx_commit?tx="hello"' ``` -which sends a 5 byte transaction: "h e l l o" \[68 65 6c 6c 6f\]. +which sends a 5-byte transaction: "h e l l o" \[68 65 6c 6c 6f\]. -Note the URL must be wrapped with single quotes, else bash will ignore -the double quotes. To avoid the single quotes, escape the double quotes: +Note that the URL in this example is enclosed in single quotes to prevent the +shell from interpreting the double quotes. Alternatively, you may escape the +double quotes with backslashes: ```sh curl http://localhost:26657/broadcast_tx_commit?tx=\"hello\" ``` -Using a special character: +The double-quoted format works for multibyte characters, as long as they +are valid UTF8, for example: ```sh curl 'http://localhost:26657/broadcast_tx_commit?tx="€5"' ``` -sends a 4 byte transaction: "€5" (UTF8) \[e2 82 ac 35\]. +sends a 4-byte transaction: "€5" (UTF8) \[e2 82 ac 35\]. -To send as raw hex, omit quotes AND prefix the hex string with `0x`: +Arbitrary (non-UTF8) transaction data may also be encoded as a string of +hexadecimal digits (2 digits per byte). To do this, omit the quotation marks +and prefix the hex string with `0x`: ```sh -curl http://localhost:26657/broadcast_tx_commit?tx=0x01020304 +curl http://localhost:26657/broadcast_tx_commit?tx=0x68656C6C6F ``` -which sends a 4 byte transaction: \[01 02 03 04\]. +which sends the 5-byte transaction: \[68 65 6c 6c 6f\]. -With `POST` (using `json`), the raw hex must be `base64` encoded: +Using `POST` (with parameters in JSON), the transaction data are sent as a JSON +string in base64 encoding: ```sh -curl --data-binary '{"jsonrpc":"2.0","id":"anything","method":"broadcast_tx_commit","params": {"tx": "AQIDBA=="}}' -H 'content-type:text/plain;' http://localhost:26657 +curl http://localhost:26657 -H 'Content-Type: application/json' --data-binary '{ + "jsonrpc": "2.0", + "id": "anything", + "method": "broadcast_tx_commit", + "params": { + "tx": "aGVsbG8=" + } +}' ``` -which sends the same 4 byte transaction: \[01 02 03 04\]. +which sends the same 5-byte transaction: \[68 65 6c 6c 6f\]. -Note that raw hex cannot be used in `POST` transactions. +Note that the hexadecimal encoding of transaction data is _not_ supported in +JSON (`POST`) requests.
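As a quick cross-check of the two encodings used above, this small Go sketch (standard library only) reproduces both forms of the same 5-byte transaction:

```go
package main

import (
	"encoding/base64"
	"encoding/hex"
	"fmt"
)

func main() {
	tx := []byte("hello")

	// Hex form for GET requests: prefix with 0x and omit the quotes.
	fmt.Println("0x" + hex.EncodeToString(tx)) // 0x68656c6c6f

	// Base64 form for JSON-RPC POST requests.
	fmt.Println(base64.StdEncoding.EncodeToString(tx)) // aGVsbG8=
}
```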
## Reset diff --git a/go.mod b/go.mod index 66b982309..d94b5c4a8 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,6 @@ go 1.16 require ( github.com/BurntSushi/toml v0.4.1 - github.com/Masterminds/squirrel v1.5.0 github.com/Workiva/go-datastructures v1.0.53 github.com/adlio/schema v1.1.13 github.com/btcsuite/btcd v0.22.0-beta @@ -13,13 +12,13 @@ require ( github.com/go-kit/kit v0.11.0 github.com/gogo/protobuf v1.3.2 github.com/golang/protobuf v1.5.2 - github.com/golangci/golangci-lint v1.42.0 + github.com/golangci/golangci-lint v1.42.1 github.com/google/orderedcode v0.0.1 github.com/google/uuid v1.3.0 github.com/gorilla/websocket v1.4.2 github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 - github.com/lib/pq v1.10.2 + github.com/lib/pq v1.10.3 github.com/libp2p/go-buffer-pool v0.0.2 github.com/minio/highwayhash v1.0.2 github.com/mroth/weightedrand v0.4.1 @@ -28,7 +27,7 @@ require ( github.com/prometheus/client_golang v1.11.0 github.com/rcrowley/go-metrics v0.0.0-20200313005456-10cdbea86bc0 github.com/rs/cors v1.8.0 - github.com/rs/zerolog v1.24.0 + github.com/rs/zerolog v1.25.0 github.com/sasha-s/go-deadlock v0.2.1-0.20190427202633-1595213edefa github.com/snikch/goodman v0.0.0-20171125024755-10e37e294daa github.com/spf13/cobra v1.2.1 diff --git a/go.sum b/go.sum index 32619bf08..8028907fc 100644 --- a/go.sum +++ b/go.sum @@ -44,8 +44,8 @@ cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RX cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= contrib.go.opencensus.io/exporter/stackdriver v0.13.4/go.mod h1:aXENhDJ1Y4lIg4EUaVTwzvYETVNZk10Pu26tevFKLUc= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= -github.com/Antonboom/errname v0.1.3 h1:qKV8gSzPzBqrG/q0dgraZXJCymWt6KuD9+Y7K7xtzN8= -github.com/Antonboom/errname v0.1.3/go.mod h1:jRXo3m0E0EuCnK3wbsSVH3X55Z4iTDLl6ZfCxwFj4TM= +github.com/Antonboom/errname v0.1.4 h1:lGSlI42Gm4bI1e+IITtXJXvxFM8N7naWimVFKcb0McY= +github.com/Antonboom/errname v0.1.4/go.mod h1:jRXo3m0E0EuCnK3wbsSVH3X55Z4iTDLl6ZfCxwFj4TM= github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78 h1:w+iIsaOQNcT7OZ575w+acHgRric5iCyQh+xv+KJ4HB8= github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78/go.mod h1:LmzpDX56iTiv29bbRTIsUNlaFfuhWRQBWjQdVyAevI8= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= @@ -63,8 +63,6 @@ github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3Q github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= github.com/Masterminds/sprig v2.15.0+incompatible/go.mod h1:y6hNFY5UBTIWBxnzTeuNhlNS5hqE0NB0E6fgfo2Br3o= github.com/Masterminds/sprig v2.22.0+incompatible/go.mod h1:y6hNFY5UBTIWBxnzTeuNhlNS5hqE0NB0E6fgfo2Br3o= -github.com/Masterminds/squirrel v1.5.0 h1:JukIZisrUXadA9pl3rMkjhiamxiB0cXiu+HGp/Y8cY8= -github.com/Masterminds/squirrel v1.5.0/go.mod h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10= github.com/Microsoft/go-winio v0.4.14 h1:+hMXMk01us9KgxGb7ftKQt2Xpf5hH/yky+TDA+qxleU= github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jBhyzoq1bpyYA= github.com/Nvveen/Gotty v0.0.0-20120604004816-cd527374f1e5 h1:TngWCqHvy9oXAN6lEVMRuU21PR1EtLVZJmdB18Gu3Rw= @@ -343,8 +341,8 @@ github.com/golangci/go-misc v0.0.0-20180628070357-927a3d87b613 h1:9kfjN3AdxcbsZB github.com/golangci/go-misc v0.0.0-20180628070357-927a3d87b613/go.mod 
h1:SyvUF2NxV+sN8upjjeVYr5W7tyxaT1JVtvhKhOn2ii8= github.com/golangci/gofmt v0.0.0-20190930125516-244bba706f1a h1:iR3fYXUjHCR97qWS8ch1y9zPNsgXThGwjKPrYfqMPks= github.com/golangci/gofmt v0.0.0-20190930125516-244bba706f1a/go.mod h1:9qCChq59u/eW8im404Q2WWTrnBUQKjpNYKMbU4M7EFU= -github.com/golangci/golangci-lint v1.42.0 h1:hqf1zo6zY3GKGjjBk3ttdH22tGwF6ZRpk6j6xyJmE8I= -github.com/golangci/golangci-lint v1.42.0/go.mod h1:wgkGQnU9lOUFvTFo5QBSOvaSSddEV21Z1zYkJSbppZA= +github.com/golangci/golangci-lint v1.42.1 h1:nC4WyrbdnNdohDVUoNKjy/4N4FTM1gCFaVeXecy6vzM= +github.com/golangci/golangci-lint v1.42.1/go.mod h1:MuInrVlgg2jq4do6XI1jbkErbVHVbwdrLLtGv6p2wPI= github.com/golangci/lint-1 v0.0.0-20191013205115-297bf364a8e0 h1:MfyDlzVjl1hoaPzPD4Gpb/QgoRfSBR0jdhwGyAWwMSA= github.com/golangci/lint-1 v0.0.0-20191013205115-297bf364a8e0/go.mod h1:66R6K6P6VWk9I95jvqGxkqJxVWGFy9XlDwLwVz1RCFg= github.com/golangci/maligned v0.0.0-20180506175553-b1d89398deca h1:kNY3/svz5T29MYHubXix4aDDuE3RWHkPvopM/EDv/MA= @@ -548,10 +546,6 @@ github.com/kunwardeep/paralleltest v1.0.2/go.mod h1:ZPqNm1fVHPllh5LPVujzbVz1JN2G github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/kyoh86/exportloopref v0.1.8 h1:5Ry/at+eFdkX9Vsdw3qU4YkvGtzuVfzT4X7S77LoN/M= github.com/kyoh86/exportloopref v0.1.8/go.mod h1:1tUcJeiioIs7VWe5gcOObrux3lb66+sBqGZrRkMwPgg= -github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 h1:SOEGU9fKiNWd/HOJuq6+3iTQz8KNCLtVX6idSoTLdUw= -github.com/lann/builder v0.0.0-20180802200727-47ae307949d0/go.mod h1:dXGbAdH5GtBTC4WfIxhKZfyBF/HBFgRZSWwZ9g/He9o= -github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 h1:P6pPBnrTSX3DEVR4fDembhRWSsG5rVo6hYhAB/ADZrk= -github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0/go.mod h1:vmVJ0l/dxyfGW6FmdpVm2joNMFikkuWg0EoCKLGUMNw= github.com/ldez/gomoddirectives v0.2.2 h1:p9/sXuNFArS2RLc+UpYZSI4KQwGMEDWC/LbtF5OPFVg= github.com/ldez/gomoddirectives v0.2.2/go.mod h1:cpgBogWITnCfRq2qGoDkKMEVSaarhdBr6g8G04uz6d0= github.com/ldez/tagliatelle v0.2.0 h1:693V8Bf1NdShJ8eu/s84QySA0J2VWBanVBa2WwXD/Wk= @@ -562,8 +556,9 @@ github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/lib/pq v1.8.0/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/lib/pq v1.9.0/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= -github.com/lib/pq v1.10.2 h1:AqzbZs4ZoCBp+GtejcpCpcxM3zlSMx29dXbUSeVtJb8= github.com/lib/pq v1.10.2/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/lib/pq v1.10.3 h1:v9QZf2Sn6AmjXtQeFpdoq/eaNtYP6IN+7lcrygsIAtg= +github.com/lib/pq v1.10.3/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/libp2p/go-buffer-pool v0.0.2 h1:QNK2iAFa8gjAe1SPz6mHSMuCcjs+X1wlHzeOSqcmlfs= github.com/libp2p/go-buffer-pool v0.0.2/go.mod h1:MvaB6xw5vOrDl8rYZGLFdKAuk/hRoRZd1Vi32+RXyFM= github.com/logrusorgru/aurora v0.0.0-20181002194514-a7b3b318ed4e/go.mod h1:7rIyQOR62GCctdiQpZ/zOJlFyk6y+94wXzv6RNZgaR4= @@ -601,8 +596,8 @@ github.com/mbilski/exhaustivestruct v1.2.0 h1:wCBmUnSYufAHO6J4AVWY6ff+oxWxsVFrwg github.com/mbilski/exhaustivestruct v1.2.0/go.mod h1:OeTBVxQWoEmB2J2JCHmXWPJ0aksxSUOUy+nvtVEfzXc= github.com/mgechev/dots v0.0.0-20190921121421-c36f7dcfbb81 h1:QASJXOGm2RZ5Ardbc86qNFvby9AqkLDibfChMtAg5QM= github.com/mgechev/dots v0.0.0-20190921121421-c36f7dcfbb81/go.mod h1:KQ7+USdGKfpPjXk4Ga+5XxQM4Lm4e3gAogrreFAYpOg= -github.com/mgechev/revive v1.1.0 h1:TvabpsolbtlzZTyJcgMRN38MHrgi8C0DhmGE5dhscGY= -github.com/mgechev/revive 
v1.1.0/go.mod h1:PKqk4L74K6wVNwY2b6fr+9Qqr/3hIsHVfZCJdbvozrY= +github.com/mgechev/revive v1.1.1 h1:mkXNHP14Y6tfq+ocnQaiKEtgJDM41yaoyQq4qn6TD/4= +github.com/mgechev/revive v1.1.1/go.mod h1:PKqk4L74K6wVNwY2b6fr+9Qqr/3hIsHVfZCJdbvozrY= github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= github.com/miekg/dns v1.1.26/go.mod h1:bPDLeHnStXmXAq1m/Ch/hvfNHr14JKNPMBo3VZKjuso= github.com/miekg/dns v1.1.35/go.mod h1:KNUDUusw/aVsxyTYZM1oqvCicbwhgbNgztCETuNZ7xM= @@ -719,8 +714,8 @@ github.com/pkg/sftp v1.10.1/go.mod h1:lYOWFsE0bwd1+KfKJaKeuokY15vzFx25BLbzYYoAxZ github.com/pmezard/go-difflib v0.0.0-20151028094244-d8ed2627bdf0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/polyfloyd/go-errorlint v0.0.0-20210510181950-ab96adb96fea h1:Sk6Xawg57ZkjXmFYD1xCHSKN6FtYM+km51MM7Lveyyc= -github.com/polyfloyd/go-errorlint v0.0.0-20210510181950-ab96adb96fea/go.mod h1:wi9BfjxjF/bwiZ701TzmfKu6UKC357IOAtNr0Td0Lvw= +github.com/polyfloyd/go-errorlint v0.0.0-20210722154253-910bb7978349 h1:Kq/3kL0k033ds3tyez5lFPrfQ74fNJ+OqCclRipubwA= +github.com/polyfloyd/go-errorlint v0.0.0-20210722154253-910bb7978349/go.mod h1:wi9BfjxjF/bwiZ701TzmfKu6UKC357IOAtNr0Td0Lvw= github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI= github.com/posener/complete v1.2.3/go.mod h1:WZIdtGGp+qx0sLrYKtIRAruyNpv6hFCicSgv7Sy7s/s= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= @@ -772,8 +767,8 @@ github.com/rs/cors v1.8.0/go.mod h1:EBwu+T5AvHOcXwvZIkQFjUN6s8Czyqw12GL/Y0tUyRM= github.com/rs/xid v1.2.1/go.mod h1:+uKXf+4Djp6Md1KODXJxgGQPKngRmWyn10oCKFzNHOQ= github.com/rs/xid v1.3.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/rs/zerolog v1.18.0/go.mod h1:9nvC1axdVrAHcu/s9taAVfBuIdTZLVQmKQyvrUjF5+I= -github.com/rs/zerolog v1.24.0 h1:76ivFxmVSRs1u2wUwJVg5VZDYQgeH1JpoS6ndgr9Wy8= -github.com/rs/zerolog v1.24.0/go.mod h1:7KHcEGe0QZPOm2IE4Kpb5rTh6n1h2hIgS5OOnu1rUaI= +github.com/rs/zerolog v1.25.0 h1:Rj7XygbUHKUlDPcVdoLyR91fJBsduXj5fRxyqIQj/II= +github.com/rs/zerolog v1.25.0/go.mod h1:7KHcEGe0QZPOm2IE4Kpb5rTh6n1h2hIgS5OOnu1rUaI= github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/ryancurrah/gomodguard v1.2.3 h1:ww2fsjqocGCAFamzvv/b8IsRduuHHeK2MHTcTxZTQX8= @@ -868,8 +863,8 @@ github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c h1:g+WoO5jjkqGAzH github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c/go.mod h1:ahpPrc7HpcfEWDQRZEmnXMzHY03mLDYMCxeDzy46i+8= github.com/tendermint/tm-db v0.6.4 h1:3N2jlnYQkXNQclQwd/eKV/NzlqPlfK21cpRRIx80XXQ= github.com/tendermint/tm-db v0.6.4/go.mod h1:dptYhIpJ2M5kUuenLr+Yyf3zQOv1SgBZcl8/BmWlMBw= -github.com/tetafro/godot v1.4.8 h1:rhuUH+tBrx24yVAr6Ox3/UxcsiUPPJcGhinfLdbdew0= -github.com/tetafro/godot v1.4.8/go.mod h1:LR3CJpxDVGlYOWn3ZZg1PgNZdTUvzsZWu8xaEohUpn8= +github.com/tetafro/godot v1.4.9 h1:wsNd0RuUxISqqudFqchsSsMqsM188DoZVPBeKl87tP0= +github.com/tetafro/godot v1.4.9/go.mod h1:LR3CJpxDVGlYOWn3ZZg1PgNZdTUvzsZWu8xaEohUpn8= github.com/timakin/bodyclose v0.0.0-20200424151742-cb6215831a94 h1:ig99OeTyDwQWhPe2iw9lwfQVF1KB3Q4fpP3X7/2VBG8= github.com/timakin/bodyclose v0.0.0-20200424151742-cb6215831a94/go.mod 
h1:Qimiffbc6q9tBWlVV6x0P9sat/ao1xEkREYPPj9hphk= github.com/tinylib/msgp v1.1.5/go.mod h1:eQsjooMTnV42mHu917E26IogZ2930nFyBQdofk10Udg= diff --git a/internal/consensus/reactor.go b/internal/consensus/reactor.go index 2b9fa7358..915756488 100644 --- a/internal/consensus/reactor.go +++ b/internal/consensus/reactor.go @@ -1096,7 +1096,7 @@ func (r *Reactor) handleDataMessage(envelope p2p.Envelope, msgI Message) error { } if r.WaitSync() { - logger.Info("ignoring message received during sync", "msg", msgI) + logger.Info("ignoring message received during sync", "msg", fmt.Sprintf("%T", msgI)) return nil } diff --git a/internal/evidence/pool.go b/internal/evidence/pool.go index 8ca97fd17..2a48fe032 100644 --- a/internal/evidence/pool.go +++ b/internal/evidence/pool.go @@ -516,10 +516,13 @@ func (evpool *Pool) processConsensusBuffer(state sm.State) { // Check the height of the conflicting votes and fetch the corresponding time and validator set // to produce the valid evidence - var dve *types.DuplicateVoteEvidence + var ( + dve *types.DuplicateVoteEvidence + err error + ) switch { case voteSet.VoteA.Height == state.LastBlockHeight: - dve = types.NewDuplicateVoteEvidence( + dve, err = types.NewDuplicateVoteEvidence( voteSet.VoteA, voteSet.VoteB, state.LastBlockTime, @@ -527,8 +530,8 @@ ) case voteSet.VoteA.Height < state.LastBlockHeight: - valSet, err := evpool.stateDB.LoadValidators(voteSet.VoteA.Height) - if err != nil { + valSet, dbErr := evpool.stateDB.LoadValidators(voteSet.VoteA.Height) + if dbErr != nil { - evpool.logger.Error("failed to load validator set for conflicting votes", "height", voteSet.VoteA.Height, "err", err) + evpool.logger.Error("failed to load validator set for conflicting votes", "height", voteSet.VoteA.Height, "err", dbErr) continue @@ -538,7 +541,7 @@ evpool.logger.Error("failed to load block time for conflicting votes", "height", voteSet.VoteA.Height) continue } - dve = types.NewDuplicateVoteEvidence( + dve, err = types.NewDuplicateVoteEvidence( voteSet.VoteA, voteSet.VoteB, blockMeta.Header.Time, @@ -554,6 +557,10 @@ "state.LastBlockHeight", state.LastBlockHeight) continue } + if err != nil { + evpool.logger.Error("error in generating evidence from votes", "err", err) + continue + } // check if we already have this evidence if evpool.isPending(dve) { @@ -608,7 +615,7 @@ } func keyCommitted(evidence types.Evidence) []byte { - var height int64 = evidence.Height() + height := evidence.Height() key, err := orderedcode.Append(nil, prefixCommitted, height, string(evidence.Hash())) if err != nil { panic(err) @@ -617,7 +624,7 @@ } func keyPending(evidence types.Evidence) []byte { - var height int64 = evidence.Height() + height := evidence.Height() key, err := orderedcode.Append(nil, prefixPending, height, string(evidence.Hash())) if err != nil { panic(err) diff --git a/internal/evidence/reactor_test.go b/internal/evidence/reactor_test.go index b098eb373..1cf995731 100644 --- a/internal/evidence/reactor_test.go +++ b/internal/evidence/reactor_test.go @@ -534,12 +534,13 @@ func TestEvidenceListSerialization(t *testing.T) { valSet := types.NewValidatorSet([]*types.Validator{val}) - dupl := types.NewDuplicateVoteEvidence( + dupl, err := types.NewDuplicateVoteEvidence( exampleVote(1), exampleVote(2), defaultEvidenceTime, valSet, ) + require.NoError(t, err) testCases := map[string]struct { evidenceList []types.Evidence diff --git
a/internal/p2p/wdrr_queue.go b/internal/p2p/wdrr_queue.go index b99403be2..1b75ffce8 100644 --- a/internal/p2p/wdrr_queue.go +++ b/internal/p2p/wdrr_queue.go @@ -16,7 +16,7 @@ type wrappedEnvelope struct { size uint } -// assert the WDDR scheduler implements the queue interface at compile-time +// assert the WDRR scheduler implements the queue interface at compile-time var _ queue = (*wdrrScheduler)(nil) // wdrrQueue implements a Weighted Deficit Round Robin (WDRR) scheduling diff --git a/internal/statesync/block_queue_test.go b/internal/statesync/block_queue_test.go index 3a4c71e4e..dc5e2bc82 100644 --- a/internal/statesync/block_queue_test.go +++ b/internal/statesync/block_queue_test.go @@ -274,8 +274,10 @@ loop: } func mockLBResp(t *testing.T, peer types.NodeID, height int64, time time.Time) lightBlockResponse { + vals, pv := factory.RandValidatorSet(3, 10) + _, _, lb := mockLB(t, height, time, factory.MakeBlockID(), vals, pv) return lightBlockResponse{ - block: mockLB(t, height, time, factory.MakeBlockID()), + block: lb, peer: peer, } } diff --git a/internal/statesync/dispatcher.go b/internal/statesync/dispatcher.go index 394b77e38..37010986f 100644 --- a/internal/statesync/dispatcher.go +++ b/internal/statesync/dispatcher.go @@ -5,7 +5,6 @@ import ( "errors" "fmt" "sync" - "time" "github.com/tendermint/tendermint/internal/p2p" "github.com/tendermint/tendermint/light/provider" @@ -17,169 +16,79 @@ import ( var ( errNoConnectedPeers = errors.New("no available peers to dispatch request to") errUnsolicitedResponse = errors.New("unsolicited light block response") - errNoResponse = errors.New("peer failed to respond within timeout") errPeerAlreadyBusy = errors.New("peer is already processing a request") - errDisconnected = errors.New("dispatcher has been disconnected") + errDisconnected = errors.New("dispatcher disconnected") ) -// dispatcher keeps a list of peers and allows concurrent requests for light -// blocks. NOTE: It is not the responsibility of the dispatcher to verify the -// light blocks. -type dispatcher struct { - availablePeers *peerlist - requestCh chan<- p2p.Envelope - timeout time.Duration +// A Dispatcher multiplexes concurrent requests to multiple peers for light blocks. +// Only one request per peer can be sent at a time. Subsequent concurrent requests will +// report an error from the LightBlock method. +// NOTE: It is not the responsibility of the dispatcher to verify the light blocks.
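+// +// Usage sketch (editor's illustration, not part of this change; requestCh, ctx, height, and peer are assumed to come from the surrounding reactor): +// +// d := NewDispatcher(requestCh) +// defer d.Close() +// lb, err := d.LightBlock(ctx, height, peer) // blocks until Respond, context cancellation, or Close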
+type Dispatcher struct { + // the channel on which to send light block requests + requestCh chan<- p2p.Envelope + closeCh chan struct{} - mtx sync.Mutex - calls map[types.NodeID]chan *types.LightBlock - running bool + mtx sync.Mutex + // all pending calls that have been dispatched and are awaiting an answer + calls map[types.NodeID]chan *types.LightBlock } -func newDispatcher(requestCh chan<- p2p.Envelope, timeout time.Duration) *dispatcher { - return &dispatcher{ - availablePeers: newPeerList(), - timeout: timeout, - requestCh: requestCh, - calls: make(map[types.NodeID]chan *types.LightBlock), - running: true, +func NewDispatcher(requestCh chan<- p2p.Envelope) *Dispatcher { + return &Dispatcher{ + requestCh: requestCh, + closeCh: make(chan struct{}), + calls: make(map[types.NodeID]chan *types.LightBlock), } } -// LightBlock uses the request channel to fetch a light block from the next peer -// in a list, tracks the call and waits for the reactor to pass along the response -func (d *dispatcher) LightBlock(ctx context.Context, height int64) (*types.LightBlock, types.NodeID, error) { - d.mtx.Lock() - // check to see that the dispatcher is connected to at least one peer - if d.availablePeers.Len() == 0 && len(d.calls) == 0 { - d.mtx.Unlock() - return nil, "", errNoConnectedPeers - } - d.mtx.Unlock() - - // fetch the next peer id in the list and request a light block from that - // peer - peer := d.availablePeers.Pop(ctx) - lb, err := d.lightBlock(ctx, height, peer) - return lb, peer, err -} - -// Providers turns the dispatcher into a set of providers (per peer) which can -// be used by a light client -func (d *dispatcher) Providers(chainID string, timeout time.Duration) []provider.Provider { - d.mtx.Lock() - defer d.mtx.Unlock() - - providers := make([]provider.Provider, d.availablePeers.Len()) - peers := d.availablePeers.Peers() - for index, peer := range peers { - providers[index] = &blockProvider{ - peer: peer, - dispatcher: d, - chainID: chainID, - timeout: timeout, - } - } - return providers -} - -func (d *dispatcher) stop() { - d.mtx.Lock() - defer d.mtx.Unlock() - d.running = false - for peer, call := range d.calls { - close(call) - delete(d.calls, peer) - } -} - -func (d *dispatcher) start() { - d.mtx.Lock() - defer d.mtx.Unlock() - d.running = true -} - -func (d *dispatcher) lightBlock(ctx context.Context, height int64, peer types.NodeID) (*types.LightBlock, error) { +// LightBlock uses the request channel to fetch a light block from a given peer, +// tracking the call and waiting for the reactor to pass back the response. A nil +// LightBlock response is used to signal that the peer doesn't have the requested LightBlock.
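+// +// For example (illustrative only), a caller can distinguish the three outcomes: +// +// lb, err := d.LightBlock(ctx, height, peer) +// switch { +// case err != nil: // dispatcher closed, context done, or peer already busy +// case lb == nil: // the peer responded but has no block at this height +// default: // lb holds the peer's (still unverified) light block +// }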
+func (d *Dispatcher) LightBlock(ctx context.Context, height int64, peer types.NodeID) (*types.LightBlock, error) { // dispatch the request to the peer callCh, err := d.dispatch(peer, height) if err != nil { return nil, err } + // clean up the call after a response is returned + defer func() { + d.mtx.Lock() + defer d.mtx.Unlock() + if call, ok := d.calls[peer]; ok { + delete(d.calls, peer) + close(call) + } + }() + // wait for a response, cancel or timeout select { case resp := <-callCh: return resp, nil case <-ctx.Done(): - d.release(peer) - return nil, nil + return nil, ctx.Err() - case <-time.After(d.timeout): - d.release(peer) - return nil, errNoResponse - } -} - -// respond allows the underlying process which receives requests on the -// requestCh to respond with the respective light block -func (d *dispatcher) respond(lb *proto.LightBlock, peer types.NodeID) error { - d.mtx.Lock() - defer d.mtx.Unlock() - - // check that the response came from a request - answerCh, ok := d.calls[peer] - if !ok { - // this can also happen if the response came in after the timeout - return errUnsolicitedResponse - } - // release the peer after returning the response - defer d.availablePeers.Append(peer) - defer close(answerCh) - defer delete(d.calls, peer) - - if lb == nil { - answerCh <- nil - return nil - } - - block, err := types.LightBlockFromProto(lb) - if err != nil { - fmt.Println("error with converting light block") - return err - } - - answerCh <- block - return nil -} - -func (d *dispatcher) addPeer(peer types.NodeID) { - d.availablePeers.Append(peer) -} - -func (d *dispatcher) removePeer(peer types.NodeID) { - d.mtx.Lock() - defer d.mtx.Unlock() - if _, ok := d.calls[peer]; ok { - delete(d.calls, peer) - } else { - d.availablePeers.Remove(peer) + case <-d.closeCh: + return nil, errDisconnected } } // dispatch takes a peer and allocates it a channel so long as it's not already // busy and the receiving channel is still running. It then dispatches the message -func (d *dispatcher) dispatch(peer types.NodeID, height int64) (chan *types.LightBlock, error) { +func (d *Dispatcher) dispatch(peer types.NodeID, height int64) (chan *types.LightBlock, error) { d.mtx.Lock() defer d.mtx.Unlock() - ch := make(chan *types.LightBlock, 1) - - // check if the dispatcher is running or not - if !d.running { - close(ch) - return ch, errDisconnected + select { + case <-d.closeCh: + return nil, errDisconnected + default: } - // this should happen only if we add the same peer twice (somehow) + ch := make(chan *types.LightBlock, 1) + + // check if a request for the same peer has already been made if _, ok := d.calls[peer]; ok { close(ch) return ch, errPeerAlreadyBusy @@ -193,47 +102,107 @@ func (d *dispatcher) dispatch(peer types.NodeID, height int64) (chan *types.Ligh Height: uint64(height), }, } + return ch, nil } -// release appends the peer back to the list and deletes the allocated call so -// that a new call can be made to that peer -func (d *dispatcher) release(peer types.NodeID) { +// Respond allows the underlying process which receives requests on the +// requestCh to respond with the respective light block. A nil response is used to +// represent that the receiver of the request does not have a light block at that height. 
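+// +// Respond is typically wired into the reactor's light block message handler, e.g. (sketch; msg, envelope, and logger come from the reactor's message loop): +// +// case *ssproto.LightBlockResponse: +// if err := d.Respond(msg.LightBlock, envelope.From); err != nil { +// logger.Error("error processing light block response", "err", err) +// }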
+func (d *Dispatcher) Respond(lb *proto.LightBlock, peer types.NodeID) error { d.mtx.Lock() defer d.mtx.Unlock() - if call, ok := d.calls[peer]; ok { - close(call) - delete(d.calls, peer) + + // check that the response came from a request + answerCh, ok := d.calls[peer] + if !ok { + // this can also happen if the response came in after the timeout + return errUnsolicitedResponse } - d.availablePeers.Append(peer) + + // If lb is nil we take that to mean that the peer didn't have the requested light + // block and thus pass on the nil to the caller. + if lb == nil { + answerCh <- nil + return nil + } + + block, err := types.LightBlockFromProto(lb) + if err != nil { + return err + } + + answerCh <- block + return nil +} + +// Close shuts down the dispatcher and cancels any pending calls awaiting responses. +// Callers awaiting responses that have not yet arrived receive a nil block. +func (d *Dispatcher) Close() { + d.mtx.Lock() + defer d.mtx.Unlock() + close(d.closeCh) + for peer, call := range d.calls { + delete(d.calls, peer) + close(call) + } +} + +func (d *Dispatcher) Done() <-chan struct{} { + return d.closeCh } //---------------------------------------------------------------- -// blockProvider is a p2p based light provider which uses a dispatcher connected +// BlockProvider is a p2p based light provider which uses a dispatcher connected // to the state sync reactor to serve light blocks to the light client // // TODO: This should probably be moved over to the light package but as we're // not yet officially supporting p2p light clients we'll leave this here for now. -type blockProvider struct { +// +// NOTE: BlockProvider will return an error with concurrent calls. However, we don't +// need a mutex because a light client (and the backfill process) will never call a +// method more than once at the same time. +type BlockProvider struct { peer types.NodeID chainID string - timeout time.Duration - dispatcher *dispatcher + dispatcher *Dispatcher } -func (p *blockProvider) LightBlock(ctx context.Context, height int64) (*types.LightBlock, error) { - // FIXME: The provider doesn't know if the dispatcher is still connected to - // that peer. If the connection is dropped for whatever reason the - // dispatcher needs to be able to relay this back to the provider so it can - // return ErrConnectionClosed instead of ErrNoResponse - ctx, cancel := context.WithTimeout(ctx, p.timeout) - defer cancel() - lb, _ := p.dispatcher.lightBlock(ctx, height, p.peer) - if lb == nil { - return nil, provider.ErrNoResponse +// NewBlockProvider creates a block provider which implements the light client Provider interface. +func NewBlockProvider(peer types.NodeID, chainID string, dispatcher *Dispatcher) *BlockProvider { + return &BlockProvider{ + peer: peer, + chainID: chainID, + dispatcher: dispatcher, + } +} + +// LightBlock fetches a light block from the peer at a specified height, returning either a +// light block or an appropriate error.
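+// +// A set of BlockProviders, one per connected peer, can back a light client (sketch mirroring the reactor's initStateProvider; peers, chainID, and dispatcher are assumed in scope): +// +// providers := make([]provider.Provider, len(peers)) +// for i, p := range peers { +// providers[i] = NewBlockProvider(p, chainID, dispatcher) +// }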
+func (p *BlockProvider) LightBlock(ctx context.Context, height int64) (*types.LightBlock, error) { + lb, err := p.dispatcher.LightBlock(ctx, height, p.peer) + switch err { + case nil: + if lb == nil { + return nil, provider.ErrLightBlockNotFound + } + case context.DeadlineExceeded, context.Canceled: + return nil, err + case errPeerAlreadyBusy: + return nil, provider.ErrLightBlockNotFound + default: + return nil, provider.ErrUnreliableProvider{Reason: err.Error()} } + // check that the height requested is the same one returned + if lb.Height != height { + return nil, provider.ErrBadLightBlock{ + Reason: fmt.Errorf("expected height %d, got height %d", height, lb.Height), + } + } + + // perform basic validation if err := lb.ValidateBasic(p.chainID); err != nil { return nil, provider.ErrBadLightBlock{Reason: err} } @@ -245,37 +214,37 @@ func (p *blockProvider) LightBlock(ctx context.Context, height int64) (*types.Li // attacks. This is a no op as there currently isn't a way to wire this up to // the evidence reactor (we should endeavor to do this in the future but for now // it's not critical for backwards verification) -func (p *blockProvider) ReportEvidence(ctx context.Context, ev types.Evidence) error { +func (p *BlockProvider) ReportEvidence(ctx context.Context, ev types.Evidence) error { return nil } // String implements stringer interface -func (p *blockProvider) String() string { return string(p.peer) } +func (p *BlockProvider) String() string { return string(p.peer) } //---------------------------------------------------------------- // peerList is a rolling list of peers. This is used to distribute the load of // retrieving blocks over all the peers the reactor is connected to -type peerlist struct { +type peerList struct { mtx sync.Mutex peers []types.NodeID waiting []chan types.NodeID } -func newPeerList() *peerlist { - return &peerlist{ +func newPeerList() *peerList { + return &peerList{ peers: make([]types.NodeID, 0), waiting: make([]chan types.NodeID, 0), } } -func (l *peerlist) Len() int { +func (l *peerList) Len() int { l.mtx.Lock() defer l.mtx.Unlock() return len(l.peers) } -func (l *peerlist) Pop(ctx context.Context) types.NodeID { +func (l *peerList) Pop(ctx context.Context) types.NodeID { l.mtx.Lock() if len(l.peers) == 0 { // if we don't have any peers in the list we block until a peer is @@ -299,7 +268,7 @@ func (l *peerlist) Pop(ctx context.Context) types.NodeID { return peer } -func (l *peerlist) Append(peer types.NodeID) { +func (l *peerList) Append(peer types.NodeID) { l.mtx.Lock() defer l.mtx.Unlock() if len(l.waiting) > 0 { @@ -312,7 +281,7 @@ func (l *peerlist) Append(peer types.NodeID) { } } -func (l *peerlist) Remove(peer types.NodeID) { +func (l *peerList) Remove(peer types.NodeID) { l.mtx.Lock() defer l.mtx.Unlock() for i, p := range l.peers { @@ -323,7 +292,7 @@ func (l *peerlist) Remove(peer types.NodeID) { } } -func (l *peerlist) Peers() []types.NodeID { +func (l *peerList) All() []types.NodeID { l.mtx.Lock() defer l.mtx.Unlock() return l.peers diff --git a/internal/statesync/dispatcher_test.go b/internal/statesync/dispatcher_test.go index 469630894..e5a6a85cd 100644 --- a/internal/statesync/dispatcher_test.go +++ b/internal/statesync/dispatcher_test.go @@ -13,145 +13,102 @@ import ( "github.com/stretchr/testify/require" "github.com/tendermint/tendermint/internal/p2p" + "github.com/tendermint/tendermint/internal/test/factory" ssproto "github.com/tendermint/tendermint/proto/tendermint/statesync" "github.com/tendermint/tendermint/types" ) func 
TestDispatcherBasic(t *testing.T) { t.Cleanup(leaktest.Check(t)) + const numPeers = 5 ch := make(chan p2p.Envelope, 100) closeCh := make(chan struct{}) defer close(closeCh) - d := newDispatcher(ch, 1*time.Second) - + d := NewDispatcher(ch) go handleRequests(t, d, ch, closeCh) - peers := createPeerSet(5) - for _, peer := range peers { - d.addPeer(peer) - } - + peers := createPeerSet(numPeers) wg := sync.WaitGroup{} // make a bunch of async requests and require that the correct responses are // given - for i := 1; i < 10; i++ { + for i := 0; i < numPeers; i++ { wg.Add(1) go func(height int64) { defer wg.Done() - lb, peer, err := d.LightBlock(context.Background(), height) + lb, err := d.LightBlock(context.Background(), height, peers[height-1]) require.NoError(t, err) require.NotNil(t, lb) require.Equal(t, lb.Height, height) - require.Contains(t, peers, peer) - }(int64(i)) + }(int64(i + 1)) } wg.Wait() + + // assert that all calls were responded to + assert.Empty(t, d.calls) } func TestDispatcherReturnsNoBlock(t *testing.T) { t.Cleanup(leaktest.Check(t)) ch := make(chan p2p.Envelope, 100) - d := newDispatcher(ch, 1*time.Second) - peerFromSet := createPeerSet(1)[0] - d.addPeer(peerFromSet) + d := NewDispatcher(ch) doneCh := make(chan struct{}) + peer := factory.NodeID("a") go func() { <-ch - require.NoError(t, d.respond(nil, peerFromSet)) + require.NoError(t, d.Respond(nil, peer)) close(doneCh) }() - lb, peerResult, err := d.LightBlock(context.Background(), 1) + lb, err := d.LightBlock(context.Background(), 1, peer) <-doneCh require.Nil(t, lb) require.Nil(t, err) - require.Equal(t, peerFromSet, peerResult) } -func TestDispatcherErrorsWhenNoPeers(t *testing.T) { +func TestDispatcherTimeOutWaitingOnLightBlock(t *testing.T) { t.Cleanup(leaktest.Check(t)) ch := make(chan p2p.Envelope, 100) - d := newDispatcher(ch, 1*time.Second) + d := NewDispatcher(ch) + peer := factory.NodeID("a") - lb, peerResult, err := d.LightBlock(context.Background(), 1) + ctx, cancelFunc := context.WithTimeout(context.Background(), 10*time.Millisecond) + defer cancelFunc() + lb, err := d.LightBlock(ctx, 1, peer) + + require.Error(t, err) + require.Equal(t, context.DeadlineExceeded, err) require.Nil(t, lb) - require.Empty(t, peerResult) - require.Equal(t, errNoConnectedPeers, err) -} - -func TestDispatcherReturnsBlockOncePeerAvailable(t *testing.T) { - t.Cleanup(leaktest.Check(t)) - dispatcherRequestCh := make(chan p2p.Envelope, 100) - d := newDispatcher(dispatcherRequestCh, 1*time.Second) - peerFromSet := createPeerSet(1)[0] - d.addPeer(peerFromSet) - ctx := context.Background() - wrapped, cancelFunc := context.WithCancel(ctx) - - doneCh := make(chan struct{}) - go func() { - lb, peerResult, err := d.LightBlock(wrapped, 1) - require.Nil(t, lb) - require.Equal(t, peerFromSet, peerResult) - require.Nil(t, err) - - // calls to dispatcher.Lightblock write into the dispatcher's requestCh. - // we read from the requestCh here to unblock the requestCh for future - // calls. 
- <-dispatcherRequestCh - close(doneCh) - }() - cancelFunc() - <-doneCh - - go func() { - <-dispatcherRequestCh - lb := &types.LightBlock{} - asProto, err := lb.ToProto() - require.Nil(t, err) - err = d.respond(asProto, peerFromSet) - require.Nil(t, err) - }() - - lb, peerResult, err := d.LightBlock(context.Background(), 1) - - require.NotNil(t, lb) - require.Equal(t, peerFromSet, peerResult) - require.Nil(t, err) } func TestDispatcherProviders(t *testing.T) { t.Cleanup(leaktest.Check(t)) ch := make(chan p2p.Envelope, 100) - chainID := "state-sync-test" + chainID := "test-chain" closeCh := make(chan struct{}) defer close(closeCh) - d := newDispatcher(ch, 1*time.Second) - + d := NewDispatcher(ch) go handleRequests(t, d, ch, closeCh) peers := createPeerSet(5) - for _, peer := range peers { - d.addPeer(peer) + providers := make([]*BlockProvider, len(peers)) + for idx, peer := range peers { + providers[idx] = NewBlockProvider(peer, chainID, d) } - - providers := d.Providers(chainID, 5*time.Second) require.Len(t, providers, 5) + for i, p := range providers { - bp, ok := p.(*blockProvider) - require.True(t, ok) - assert.Equal(t, bp.String(), string(peers[i])) + assert.Equal(t, string(peers[i]), p.String(), i) lb, err := p.LightBlock(context.Background(), 10) - assert.Error(t, err) - assert.Nil(t, lb) + assert.NoError(t, err) + assert.NotNil(t, lb) } } @@ -166,7 +123,7 @@ func TestPeerListBasic(t *testing.T) { peerList.Append(peer) } - for idx, peer := range peerList.Peers() { + for idx, peer := range peerList.All() { assert.Equal(t, peer, peerSet[idx]) } @@ -178,13 +135,22 @@ func TestPeerListBasic(t *testing.T) { } assert.Equal(t, half, peerList.Len()) + // removing a peer that doesn't exist should not change the list peerList.Remove(types.NodeID("lp")) assert.Equal(t, half, peerList.Len()) + // removing a peer that exists should decrease the list size by one peerList.Remove(peerSet[half]) - half++ - assert.Equal(t, peerSet[half], peerList.Pop(ctx)) + assert.Equal(t, numPeers-half-1, peerList.Len()) + // popping the next peer should work as expected + assert.Equal(t, peerSet[half+1], peerList.Pop(ctx)) + assert.Equal(t, numPeers-half-2, peerList.Len()) + + // append the two peers back + peerList.Append(peerSet[half]) + peerList.Append(peerSet[half+1]) + assert.Equal(t, half, peerList.Len()) } func TestPeerListBlocksWhenEmpty(t *testing.T) { @@ -277,9 +243,28 @@ func TestPeerListConcurrent(t *testing.T) { } } +func TestPeerListRemove(t *testing.T) { + peerList := newPeerList() + numPeers := 10 + + peerSet := createPeerSet(numPeers) + for _, peer := range peerSet { + peerList.Append(peer) + } + + for _, peer := range peerSet { + peerList.Remove(peer) + for _, p := range peerList.All() { + require.NotEqual(t, p, peer) + } + numPeers-- + require.Equal(t, numPeers, peerList.Len()) + } +} + // handleRequests is a helper function usually run in a separate go routine to // imitate the expected responses of the reactor wired to the dispatcher -func handleRequests(t *testing.T, d *dispatcher, ch chan p2p.Envelope, closeCh chan struct{}) { +func handleRequests(t *testing.T, d *Dispatcher, ch chan p2p.Envelope, closeCh chan struct{}) { t.Helper() for { select { @@ -288,7 +273,7 @@ func handleRequests(t *testing.T, d *dispatcher, ch chan p2p.Envelope, closeCh c peer := request.To resp := mockLBResp(t, peer, int64(height), time.Now()) block, _ := resp.block.ToProto() - require.NoError(t, d.respond(block, resp.peer)) + require.NoError(t, d.Respond(block, resp.peer)) case <-closeCh: return } diff --git 
a/internal/statesync/mock_sync_reactor.go b/internal/statesync/mock_sync_reactor.go deleted file mode 100644 index 6688ce4d2..000000000 --- a/internal/statesync/mock_sync_reactor.go +++ /dev/null @@ -1,50 +0,0 @@ -package statesync - -import ( - "context" - "time" - - mock "github.com/stretchr/testify/mock" - state "github.com/tendermint/tendermint/state" -) - -// MockSyncReactor is an autogenerated mock type for the SyncReactor type. -// Because of the stateprovider uses in Sync(), we use package statesync instead of mocks. -type MockSyncReactor struct { - mock.Mock -} - -// Backfill provides a mock function with given fields: _a0 -func (_m *MockSyncReactor) Backfill(_a0 state.State) error { - ret := _m.Called(_a0) - - var r0 error - if rf, ok := ret.Get(0).(func(state.State) error); ok { - r0 = rf(_a0) - } else { - r0 = ret.Error(0) - } - - return r0 -} - -// Sync provides a mock function with given fields: _a0, _a1, _a2 -func (_m *MockSyncReactor) Sync(_a0 context.Context, _a1 StateProvider, _a2 time.Duration) (state.State, error) { - ret := _m.Called(_a0, _a1, _a2) - - var r0 state.State - if rf, ok := ret.Get(0).(func(context.Context, StateProvider, time.Duration) state.State); ok { - r0 = rf(_a0, _a1, _a2) - } else { - r0 = ret.Get(0).(state.State) - } - - var r1 error - if rf, ok := ret.Get(1).(func(context.Context, StateProvider, time.Duration) error); ok { - r1 = rf(_a0, _a1, _a2) - } else { - r1 = ret.Error(1) - } - - return r0, r1 -} diff --git a/internal/statesync/reactor.go b/internal/statesync/reactor.go index 59cbabd14..6c0d26812 100644 --- a/internal/statesync/reactor.go +++ b/internal/statesync/reactor.go @@ -16,6 +16,8 @@ import ( "github.com/tendermint/tendermint/internal/p2p" "github.com/tendermint/tendermint/libs/log" "github.com/tendermint/tendermint/libs/service" + "github.com/tendermint/tendermint/light" + "github.com/tendermint/tendermint/light/provider" ssproto "github.com/tendermint/tendermint/proto/tendermint/statesync" "github.com/tendermint/tendermint/proxy" sm "github.com/tendermint/tendermint/state" @@ -61,13 +63,24 @@ var ( MsgType: new(ssproto.Message), Descriptor: &p2p.ChannelDescriptor{ ID: byte(LightBlockChannel), - Priority: 2, + Priority: 5, SendQueueCapacity: 10, RecvMessageCapacity: lightBlockMsgSize, RecvBufferCapacity: 128, MaxSendBytes: 400, }, }, + ParamsChannel: { + MsgType: new(ssproto.Message), + Descriptor: &p2p.ChannelDescriptor{ + ID: byte(ParamsChannel), + Priority: 2, + SendQueueCapacity: 10, + RecvMessageCapacity: paramMsgSize, + RecvBufferCapacity: 128, + MaxSendBytes: 400, + }, + }, } ) @@ -81,6 +94,9 @@ const ( // LightBlockChannel exchanges light blocks LightBlockChannel = p2p.ChannelID(0x62) + // ParamsChannel exchanges consensus params + ParamsChannel = p2p.ChannelID(0x63) + // recentSnapshots is the number of recent snapshots to send and receive per peer. 
recentSnapshots = 10 @@ -91,31 +107,34 @@ const ( chunkMsgSize = int(16e6) // ~16MB // lightBlockMsgSize is the maximum size of a lightBlockResponseMessage lightBlockMsgSize = int(1e7) // ~10MB + + // paramMsgSize is the maximum size of a paramsResponseMessage + paramMsgSize = int(1e5) // ~100kb // lightBlockResponseTimeout is how long the dispatcher waits for a peer to // return a light block - lightBlockResponseTimeout = 30 * time.Second + lightBlockResponseTimeout = 10 * time.Second + + // consensusParamsResponseTimeout is the time the p2p state provider waits + // before performing a secondary call + consensusParamsResponseTimeout = 5 * time.Second // maxLightBlockRequestRetries is the amount of retries acceptable before // the backfill process aborts maxLightBlockRequestRetries = 20 ) -// SyncReactor defines an interface used for testing abilities of node.startStateSync. -type SyncReactor interface { - Sync(context.Context, StateProvider, time.Duration) (sm.State, error) - Backfill(sm.State) error -} - // Reactor handles state sync, both restoring snapshots for the local node and // serving snapshots for other nodes. type Reactor struct { service.BaseService - cfg config.StateSyncConfig - stateStore sm.Store - blockStore *store.BlockStore + chainID string + initialHeight int64 + cfg config.StateSyncConfig + stateStore sm.Store + blockStore *store.BlockStore conn proxy.AppConnSnapshot connQuery proxy.AppConnQuery @@ -123,15 +142,22 @@ type Reactor struct { snapshotCh *p2p.Channel chunkCh *p2p.Channel blockCh *p2p.Channel + paramsCh *p2p.Channel peerUpdates *p2p.PeerUpdates closeCh chan struct{} - dispatcher *dispatcher + // Dispatcher is used to multiplex light block requests and responses over multiple + // peers used by the p2p state provider and in reverse sync. + dispatcher *Dispatcher + peers *peerList - // This will only be set when a state sync is in progress. It is used to feed - // received snapshots and chunks into the sync. - mtx tmsync.RWMutex - syncer *syncer + // These will only be set when a state sync is in progress. They are used to feed + // received snapshots and chunks into the syncer and to manage incoming and outgoing + // providers. + mtx tmsync.RWMutex + syncer *syncer + providers map[types.NodeID]*BlockProvider + stateProvider StateProvider } // NewReactor returns a reference to a new state sync reactor, which implements @@ -139,29 +165,36 @@ type Reactor struct { // and querying, references to p2p Channels and a channel to listen for peer // updates on. Note, the reactor will close all p2p Channels when stopping.
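+// +// Call-site sketch (editor's illustration; the real wiring lives in the node setup, and genDoc here is an assumed stand-in for the genesis document that supplies ChainID and InitialHeight): +// +// r := NewReactor(genDoc.ChainID, genDoc.InitialHeight, cfg.StateSync, logger, +// conn, connQuery, snapshotCh, chunkCh, blockCh, paramsCh, +// peerUpdates, stateStore, blockStore, tempDir)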
func NewReactor( + chainID string, + initialHeight int64, cfg config.StateSyncConfig, logger log.Logger, conn proxy.AppConnSnapshot, connQuery proxy.AppConnQuery, - snapshotCh, chunkCh, blockCh *p2p.Channel, + snapshotCh, chunkCh, blockCh, paramsCh *p2p.Channel, peerUpdates *p2p.PeerUpdates, stateStore sm.Store, blockStore *store.BlockStore, tempDir string, ) *Reactor { r := &Reactor{ - cfg: cfg, - conn: conn, - connQuery: connQuery, - snapshotCh: snapshotCh, - chunkCh: chunkCh, - blockCh: blockCh, - peerUpdates: peerUpdates, - closeCh: make(chan struct{}), - tempDir: tempDir, - dispatcher: newDispatcher(blockCh.Out, lightBlockResponseTimeout), - stateStore: stateStore, - blockStore: blockStore, + chainID: chainID, + initialHeight: initialHeight, + cfg: cfg, + conn: conn, + connQuery: connQuery, + snapshotCh: snapshotCh, + chunkCh: chunkCh, + blockCh: blockCh, + paramsCh: paramsCh, + peerUpdates: peerUpdates, + closeCh: make(chan struct{}), + tempDir: tempDir, + stateStore: stateStore, + blockStore: blockStore, + peers: newPeerList(), + dispatcher: NewDispatcher(blockCh.Out), + providers: make(map[types.NodeID]*BlockProvider), } r.BaseService = *service.NewBaseService(logger, "StateSync", r) @@ -170,26 +203,20 @@ func NewReactor( // OnStart starts separate go routines for each p2p Channel and listens for // envelopes on each. In addition, it also listens for peer updates and handles -// messages on that p2p channel accordingly. The caller must be sure to execute -// OnStop to ensure the outbound p2p Channels are closed. No error is returned. +// messages on that p2p channel accordingly. Note, we do not launch a go-routine to +// handle individual envelopes as to not have to deal with bounding workers or pools. +// The caller must be sure to execute OnStop to ensure the outbound p2p Channels are +// closed. No error is returned. func (r *Reactor) OnStart() error { - // Listen for envelopes on the snapshot p2p Channel in a separate go-routine - // as to not block or cause IO contention with the chunk p2p Channel. Note, - // we do not launch a go-routine to handle individual envelopes as to not - // have to deal with bounding workers or pools. go r.processSnapshotCh() - // Listen for envelopes on the chunk p2p Channel in a separate go-routine - // as to not block or cause IO contention with the snapshot p2p Channel. Note, - // we do not launch a go-routine to handle individual envelopes as to not - // have to deal with bounding workers or pools. go r.processChunkCh() go r.processBlockCh() - go r.processPeerUpdates() + go r.processParamsCh() - r.dispatcher.start() + go r.processPeerUpdates() return nil } @@ -198,7 +225,9 @@ func (r *Reactor) OnStart() error { // blocking until they all exit. func (r *Reactor) OnStop() { // tell the dispatcher to stop sending any more requests - r.dispatcher.stop() + r.dispatcher.Close() + // wait for any remaining requests to complete + <-r.dispatcher.Done() // Close closeCh to signal to all spawned goroutines to gracefully exit. All // p2p Channels should execute Close(). @@ -210,27 +239,27 @@ func (r *Reactor) OnStop() { <-r.snapshotCh.Done() <-r.chunkCh.Done() <-r.blockCh.Done() + <-r.paramsCh.Done() <-r.peerUpdates.Done() } // Sync runs a state sync, fetching snapshots and providing chunks to the -// application. It also saves tendermint state and runs a backfill process to -// retrieve the necessary amount of headers, commits and validators sets to be -// able to process evidence and participate in consensus. 
-func (r *Reactor) Sync( - ctx context.Context, - stateProvider StateProvider, - discoveryTime time.Duration, -) (sm.State, error) { +// application. At the close of the operation, Sync will bootstrap the state +// store and persist the commit at that height so that either consensus or +// blocksync can commence. It will then proceed to backfill the necessary number +// of historical blocks before participating in consensus. +func (r *Reactor) Sync(ctx context.Context) (sm.State, error) { + // We need at least two peers (for cross-referencing of light blocks) before we can + // begin state sync. + r.waitForEnoughPeers(ctx, 2) r.mtx.Lock() if r.syncer != nil { r.mtx.Unlock() return sm.State{}, errors.New("a state sync is already in progress") } - if stateProvider == nil { - r.mtx.Unlock() - return sm.State{}, errors.New("the stateProvider should not be nil when doing the state sync") + if err := r.initStateProvider(ctx, r.chainID, r.initialHeight); err != nil { + r.mtx.Unlock() + return sm.State{}, err } r.syncer = newSyncer( @@ -238,12 +267,19 @@ func (r *Reactor) Sync( r.Logger, r.conn, r.connQuery, - stateProvider, + r.stateProvider, r.snapshotCh.Out, r.chunkCh.Out, r.tempDir, ) r.mtx.Unlock() + defer func() { + r.mtx.Lock() + // reset syncing objects at the close of Sync + r.syncer = nil + r.stateProvider = nil + r.mtx.Unlock() + }() requestSnapshotsHook := func() { // request snapshots from all currently connected peers @@ -253,15 +289,11 @@ func (r *Reactor) Sync( } } - state, commit, err := r.syncer.SyncAny(ctx, discoveryTime, requestSnapshotsHook) + state, commit, err := r.syncer.SyncAny(ctx, r.cfg.DiscoveryTime, requestSnapshotsHook) if err != nil { return sm.State{}, err } - r.mtx.Lock() - r.syncer = nil - r.mtx.Unlock() - err = r.stateStore.Bootstrap(state) if err != nil { return sm.State{}, fmt.Errorf("failed to bootstrap node with new state: %w", err) @@ -272,6 +304,11 @@ func (r *Reactor) Sync( return sm.State{}, fmt.Errorf("failed to store last seen commit: %w", err) } + err = r.Backfill(ctx, state) + if err != nil { + r.Logger.Error("backfill failed. Proceeding optimistically...", "err", err) + } + return state, nil } @@ -279,7 +316,7 @@ func (r *Reactor) Sync( // order. It does not stop verifying blocks until reaching a block with a height // and time that is less or equal to the stopHeight and stopTime. The // trustedBlockID should be of the header at startHeight.
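+// +// Worked example (editor's illustration): with LastBlockHeight = 1000, MaxAgeNumBlocks = 100, and an initial height of 1, stopHeight is 900, so backfill verifies headers backwards from the trusted block at height 1000 until it has stored blocks down to height 900 (or reaches a block older than stopTime), giving the node the headers, commits, and validator sets it needs to process evidence.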
-func (r *Reactor) Backfill(state sm.State) error { +func (r *Reactor) Backfill(ctx context.Context, state sm.State) error { params := state.ConsensusParams.Evidence stopHeight := state.LastBlockHeight - params.MaxAgeNumBlocks stopTime := state.LastBlockTime.Add(-params.MaxAgeDuration) @@ -290,7 +327,7 @@ func (r *Reactor) Backfill(state sm.State) error { stopTime = state.LastBlockTime } return r.backfill( - context.Background(), + ctx, state.ChainID, state.LastBlockHeight, stopHeight, @@ -308,12 +345,12 @@ func (r *Reactor) backfill( stopTime time.Time, ) error { r.Logger.Info("starting backfill process...", "startHeight", startHeight, - "stopHeight", stopHeight, "trustedBlockID", trustedBlockID) + "stopHeight", stopHeight, "stopTime", stopTime, "trustedBlockID", trustedBlockID) const sleepTime = 1 * time.Second var ( lastValidatorSet *types.ValidatorSet - lastChangeHeight int64 = startHeight + lastChangeHeight = startHeight ) queue := newBlockQueue(startHeight, stopHeight, initialHeight, stopTime, maxLightBlockRequestRetries) @@ -330,8 +367,18 @@ func (r *Reactor) backfill( for { select { case height := <-queue.nextHeight(): - r.Logger.Debug("fetching next block", "height", height) - lb, peer, err := r.dispatcher.LightBlock(ctxWithCancel, height) + // pop the next peer of the list to send a request to + peer := r.peers.Pop(ctx) + r.Logger.Debug("fetching next block", "height", height, "peer", peer) + subCtx, cancel := context.WithTimeout(ctxWithCancel, lightBlockResponseTimeout) + defer cancel() + lb, err := func() (*types.LightBlock, error) { + defer cancel() + // request the light block with a timeout + return r.dispatcher.LightBlock(subCtx, height, peer) + }() + // once the peer has returned a value, add it back to the peer list to be used again + r.peers.Append(peer) if errors.Is(err, context.Canceled) { return } @@ -353,7 +400,7 @@ func (r *Reactor) backfill( queue.retry(height) // As we are fetching blocks backwards, if this node doesn't have the block it likely doesn't // have any prior ones, thus we remove it from the peer list. - r.dispatcher.removePeer(peer) + r.peers.Remove(peer) continue } @@ -450,12 +497,6 @@ func (r *Reactor) backfill( } } -// Dispatcher exposes the dispatcher so that a state provider can use it for -// light client verification -func (r *Reactor) Dispatcher() *dispatcher { //nolint:golint - return r.dispatcher -} - // handleSnapshotMessage handles envelopes sent from peers on the // SnapshotChannel. It returns an error only if the Envelope.Message is unknown // for this channel. This should never be called outside of handleMessage. 
@@ -498,7 +539,7 @@ func (r *Reactor) handleSnapshotMessage(envelope p2p.Envelope) error { return nil } - logger.Debug("received snapshot", "height", msg.Height, "format", msg.Format) + logger.Info("received snapshot", "height", msg.Height, "format", msg.Format) _, err := r.syncer.AddSnapshot(envelope.From, &snapshot{ Height: msg.Height, Format: msg.Format, @@ -516,6 +557,7 @@ func (r *Reactor) handleSnapshotMessage(envelope p2p.Envelope) error { ) return nil } + logger.Info("added snapshot", "height", msg.Height, "format", msg.Format) default: return fmt.Errorf("received unknown message: %T", msg) @@ -623,6 +665,15 @@ func (r *Reactor) handleLightBlockMessage(envelope p2p.Envelope) error { r.Logger.Error("failed to retrieve light block", "err", err, "height", msg.Height) return err } + if lb == nil { + r.blockCh.Out <- p2p.Envelope{ + To: envelope.From, + Message: &ssproto.LightBlockResponse{ + LightBlock: nil, + }, + } + return nil + } lbproto, err := lb.ToProto() if err != nil { @@ -640,8 +691,55 @@ func (r *Reactor) handleLightBlockMessage(envelope p2p.Envelope) error { } case *ssproto.LightBlockResponse: - if err := r.dispatcher.respond(msg.LightBlock, envelope.From); err != nil { - r.Logger.Error("error processing light block response", "err", err) + var height int64 = 0 + if msg.LightBlock != nil { + height = msg.LightBlock.SignedHeader.Header.Height + } + r.Logger.Info("received light block response", "peer", envelope.From, "height", height) + if err := r.dispatcher.Respond(msg.LightBlock, envelope.From); err != nil { + r.Logger.Error("error processing light block response", "err", err, "height", height) + } + + default: + return fmt.Errorf("received unknown message: %T", msg) + } + + return nil +} + +func (r *Reactor) handleParamsMessage(envelope p2p.Envelope) error { + switch msg := envelope.Message.(type) { + case *ssproto.ParamsRequest: + r.Logger.Debug("received consensus params request", "height", msg.Height) + cp, err := r.stateStore.LoadConsensusParams(int64(msg.Height)) + if err != nil { + r.Logger.Error("failed to fetch requested consensus params", "err", err, "height", msg.Height) + return nil + } + + cpproto := cp.ToProto() + r.paramsCh.Out <- p2p.Envelope{ + To: envelope.From, + Message: &ssproto.ParamsResponse{ + Height: msg.Height, + ConsensusParams: cpproto, + }, + } + + case *ssproto.ParamsResponse: + r.mtx.RLock() + defer r.mtx.RUnlock() + r.Logger.Debug("received consensus params response", "height", msg.Height) + + cp := types.ConsensusParamsFromProto(msg.ConsensusParams) + + if sp, ok := r.stateProvider.(*stateProviderP2P); ok { + select { + case sp.paramsRecvCh <- cp: + default: + } + } else { + r.Logger.Debug("received unexpected params response; using RPC state provider", "peer", envelope.From) } default: @@ -678,6 +776,9 @@ func (r *Reactor) handleMessage(chID p2p.ChannelID, envelope p2p.Envelope) (err case LightBlockChannel: err = r.handleLightBlockMessage(envelope) + case ParamsChannel: + err = r.handleParamsMessage(envelope) + default: err = fmt.Errorf("unknown channel ID (%d) for envelope (%v)", chID, envelope) } @@ -703,6 +804,10 @@ func (r *Reactor) processBlockCh() { r.processCh(r.blockCh, "light block") } +func (r *Reactor) processParamsCh() { + r.processCh(r.paramsCh, "consensus params") +} + // processCh routes state sync messages to their respective handlers. Any error // encountered during message execution will result in a PeerError being sent on // the respective channel. 
When the reactor is stopped, we will catch the signal @@ -732,24 +837,38 @@ func (r *Reactor) processCh(ch *p2p.Channel, chName string) { // processPeerUpdate processes a PeerUpdate, returning an error upon failing to // handle the PeerUpdate or if a panic is recovered. func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) { - r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) - - r.mtx.RLock() - defer r.mtx.RUnlock() + r.Logger.Info("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) switch peerUpdate.Status { case p2p.PeerStatusUp: - if r.syncer != nil { - r.syncer.AddPeer(peerUpdate.NodeID) + r.peers.Append(peerUpdate.NodeID) + case p2p.PeerStatusDown: + r.peers.Remove(peerUpdate.NodeID) + } + + r.mtx.Lock() + if r.syncer == nil { + r.mtx.Unlock() + return + } + defer r.mtx.Unlock() + + switch peerUpdate.Status { + case p2p.PeerStatusUp: + newProvider := NewBlockProvider(peerUpdate.NodeID, r.chainID, r.dispatcher) + r.providers[peerUpdate.NodeID] = newProvider + r.syncer.AddPeer(peerUpdate.NodeID) + if sp, ok := r.stateProvider.(*stateProviderP2P); ok { + // we do this in a separate routine to not block whilst waiting for the light client to finish + // whatever call it's currently executing + go sp.addProvider(newProvider) } - r.dispatcher.addPeer(peerUpdate.NodeID) case p2p.PeerStatusDown: - if r.syncer != nil { - r.syncer.RemovePeer(peerUpdate.NodeID) - } - r.dispatcher.removePeer(peerUpdate.NodeID) + delete(r.providers, peerUpdate.NodeID) + r.syncer.RemovePeer(peerUpdate.NodeID) } + r.Logger.Info("processed peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) } // processPeerUpdates initiates a blocking process where we listen for and handle @@ -839,5 +958,50 @@ func (r *Reactor) fetchLightBlock(height uint64) (*types.LightBlock, error) { }, ValidatorSet: vals, }, nil - +} + +func (r *Reactor) waitForEnoughPeers(ctx context.Context, numPeers int) { + t := time.NewTicker(200 * time.Millisecond) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + if r.peers.Len() >= numPeers { + return + } + } + } +} + +func (r *Reactor) initStateProvider(ctx context.Context, chainID string, initialHeight int64) error { + var err error + to := light.TrustOptions{ + Period: r.cfg.TrustPeriod, + Height: r.cfg.TrustHeight, + Hash: r.cfg.TrustHashBytes(), + } + spLogger := r.Logger.With("module", "stateprovider") + spLogger.Info("initializing state provider", "trustPeriod", to.Period, + "trustHeight", to.Height, "useP2P", r.cfg.UseP2P) + + if r.cfg.UseP2P { + peers := r.peers.All() + providers := make([]provider.Provider, len(peers)) + for idx, p := range peers { + providers[idx] = NewBlockProvider(p, chainID, r.dispatcher) + } + + r.stateProvider, err = NewP2PStateProvider(ctx, chainID, initialHeight, providers, to, r.paramsCh.Out, spLogger) + if err != nil { + return fmt.Errorf("failed to initialize P2P state provider: %w", err) + } + } else { + r.stateProvider, err = NewRPCStateProvider(ctx, chainID, initialHeight, r.cfg.RPCServers, to, spLogger) + if err != nil { + return fmt.Errorf("failed to initialize RPC state provider: %w", err) + } + } + return nil } diff --git a/internal/statesync/reactor_test.go b/internal/statesync/reactor_test.go index 9bff72679..6373ed6ab 100644 --- a/internal/statesync/reactor_test.go +++ b/internal/statesync/reactor_test.go @@ -3,6 +3,7 @@ package statesync import ( "context" "fmt" + "strings" "sync" "testing" "time" @@ -21,6 +22,7 @@ import ( 
"github.com/tendermint/tendermint/light/provider" ssproto "github.com/tendermint/tendermint/proto/tendermint/statesync" tmproto "github.com/tendermint/tendermint/proto/tendermint/types" + "github.com/tendermint/tendermint/proxy" proxymocks "github.com/tendermint/tendermint/proxy/mocks" smmocks "github.com/tendermint/tendermint/state/mocks" "github.com/tendermint/tendermint/store" @@ -50,6 +52,11 @@ type reactorTestSuite struct { blockOutCh chan p2p.Envelope blockPeerErrCh chan p2p.PeerError + paramsChannel *p2p.Channel + paramsInCh chan p2p.Envelope + paramsOutCh chan p2p.Envelope + paramsPeerErrCh chan p2p.PeerError + peerUpdateCh chan p2p.PeerUpdate peerUpdates *p2p.PeerUpdates @@ -86,6 +93,9 @@ func setup( blockInCh: make(chan p2p.Envelope, chBuf), blockOutCh: make(chan p2p.Envelope, chBuf), blockPeerErrCh: make(chan p2p.PeerError, chBuf), + paramsInCh: make(chan p2p.Envelope, chBuf), + paramsOutCh: make(chan p2p.Envelope, chBuf), + paramsPeerErrCh: make(chan p2p.PeerError, chBuf), conn: conn, connQuery: connQuery, stateProvider: stateProvider, @@ -118,12 +128,22 @@ func setup( rts.blockPeerErrCh, ) + rts.paramsChannel = p2p.NewChannel( + ParamsChannel, + new(ssproto.Message), + rts.paramsInCh, + rts.paramsOutCh, + rts.paramsPeerErrCh, + ) + rts.stateStore = &smmocks.Store{} rts.blockStore = store.NewBlockStore(dbm.NewMemDB()) cfg := config.DefaultStateSyncConfig() rts.reactor = NewReactor( + factory.DefaultTestChainID, + 1, *cfg, log.TestingLogger(), conn, @@ -131,15 +151,13 @@ func setup( rts.snapshotChannel, rts.chunkChannel, rts.blockChannel, + rts.paramsChannel, rts.peerUpdates, rts.stateStore, rts.blockStore, "", ) - // override the dispatcher with one with a shorter timeout - rts.reactor.dispatcher = newDispatcher(rts.blockChannel.Out, 1*time.Second) - rts.syncer = newSyncer( *cfg, log.NewNopLogger(), @@ -162,6 +180,58 @@ func setup( return rts } +func TestReactor_Sync(t *testing.T) { + const snapshotHeight = 7 + rts := setup(t, nil, nil, nil, 2) + chain := buildLightBlockChain(t, 1, 10, time.Now()) + // app accepts any snapshot + rts.conn.On("OfferSnapshotSync", ctx, mock.AnythingOfType("types.RequestOfferSnapshot")). + Return(&abci.ResponseOfferSnapshot{Result: abci.ResponseOfferSnapshot_ACCEPT}, nil) + + // app accepts every chunk + rts.conn.On("ApplySnapshotChunkSync", ctx, mock.AnythingOfType("types.RequestApplySnapshotChunk")). 
+ Return(&abci.ResponseApplySnapshotChunk{Result: abci.ResponseApplySnapshotChunk_ACCEPT}, nil) + + // app query returns valid state app hash + rts.connQuery.On("InfoSync", ctx, proxy.RequestInfo).Return(&abci.ResponseInfo{ + AppVersion: 9, + LastBlockHeight: snapshotHeight, + LastBlockAppHash: chain[snapshotHeight+1].AppHash, + }, nil) + + // store accepts state and validator sets + rts.stateStore.On("Bootstrap", mock.AnythingOfType("state.State")).Return(nil) + rts.stateStore.On("SaveValidatorSets", mock.AnythingOfType("int64"), mock.AnythingOfType("int64"), + mock.AnythingOfType("*types.ValidatorSet")).Return(nil) + + closeCh := make(chan struct{}) + defer close(closeCh) + go handleLightBlockRequests(t, chain, rts.blockOutCh, + rts.blockInCh, closeCh, 0) + go graduallyAddPeers(rts.peerUpdateCh, closeCh, 1*time.Second) + go handleSnapshotRequests(t, rts.snapshotOutCh, rts.snapshotInCh, closeCh, []snapshot{ + { + Height: uint64(snapshotHeight), + Format: 1, + Chunks: 1, + }, + }) + + go handleChunkRequests(t, rts.chunkOutCh, rts.chunkInCh, closeCh, []byte("abc")) + + go handleConsensusParamsRequest(t, rts.paramsOutCh, rts.paramsInCh, closeCh) + + // update the config to use the p2p provider + rts.reactor.cfg.UseP2P = true + rts.reactor.cfg.TrustHeight = 1 + rts.reactor.cfg.TrustHash = fmt.Sprintf("%X", chain[1].Hash()) + rts.reactor.cfg.DiscoveryTime = 1 * time.Second + + // Run state sync + _, err := rts.reactor.Sync(context.Background()) + require.NoError(t, err) +} + func TestReactor_ChunkRequest_InvalidRequest(t *testing.T) { rts := setup(t, nil, nil, nil, 2) @@ -370,7 +440,7 @@ func TestReactor_LightBlockResponse(t *testing.T) { } } -func TestReactor_Dispatcher(t *testing.T) { +func TestReactor_BlockProviders(t *testing.T) { rts := setup(t, nil, nil, nil, 2) rts.peerUpdateCh <- p2p.PeerUpdate{ NodeID: types.NodeID("aa"), @@ -387,9 +457,13 @@ func TestReactor_Dispatcher(t *testing.T) { chain := buildLightBlockChain(t, 1, 10, time.Now()) go handleLightBlockRequests(t, chain, rts.blockOutCh, rts.blockInCh, closeCh, 0) - dispatcher := rts.reactor.Dispatcher() - providers := dispatcher.Providers(factory.DefaultTestChainID, 5*time.Second) - require.Len(t, providers, 2) + peers := rts.reactor.peers.All() + require.Len(t, peers, 2) + + providers := make([]provider.Provider, len(peers)) + for idx, peer := range peers { + providers[idx] = NewBlockProvider(peer, factory.DefaultTestChainID, rts.reactor.dispatcher) + } wg := sync.WaitGroup{} @@ -416,6 +490,59 @@ func TestReactor_Dispatcher(t *testing.T) { t.Fail() case <-ctx.Done(): } + +} + +func TestReactor_StateProviderP2P(t *testing.T) { + rts := setup(t, nil, nil, nil, 2) + // make syncer non nil else test won't think we are state syncing + rts.reactor.syncer = rts.syncer + peerA := types.NodeID(strings.Repeat("a", 2*types.NodeIDByteLength)) + peerB := types.NodeID(strings.Repeat("b", 2*types.NodeIDByteLength)) + rts.peerUpdateCh <- p2p.PeerUpdate{ + NodeID: peerA, + Status: p2p.PeerStatusUp, + } + rts.peerUpdateCh <- p2p.PeerUpdate{ + NodeID: peerB, + Status: p2p.PeerStatusUp, + } + + closeCh := make(chan struct{}) + defer close(closeCh) + + chain := buildLightBlockChain(t, 1, 10, time.Now()) + go handleLightBlockRequests(t, chain, rts.blockOutCh, rts.blockInCh, closeCh, 0) + go handleConsensusParamsRequest(t, rts.paramsOutCh, rts.paramsInCh, closeCh) + + rts.reactor.cfg.UseP2P = true + rts.reactor.cfg.TrustHeight = 1 + rts.reactor.cfg.TrustHash = fmt.Sprintf("%X", chain[1].Hash()) + ctx := context.Background() + rts.reactor.mtx.Lock() + 
err := rts.reactor.initStateProvider(ctx, factory.DefaultTestChainID, 1) + rts.reactor.mtx.Unlock() + require.NoError(t, err) + rts.reactor.syncer.stateProvider = rts.reactor.stateProvider + + appHash, err := rts.reactor.stateProvider.AppHash(ctx, 5) + require.NoError(t, err) + require.Len(t, appHash, 32) + + state, err := rts.reactor.stateProvider.State(ctx, 5) + require.NoError(t, err) + require.Equal(t, appHash, state.AppHash) + require.Equal(t, types.DefaultConsensusParams(), &state.ConsensusParams) + + commit, err := rts.reactor.stateProvider.Commit(ctx, 5) + require.NoError(t, err) + require.Equal(t, commit.BlockID, state.LastBlockID) + + added, err := rts.reactor.syncer.AddSnapshot(peerA, &snapshot{ + Height: 1, Format: 2, Chunks: 7, Hash: []byte{1, 2}, Metadata: []byte{1}, + }) + require.NoError(t, err) + require.True(t, added) } func TestReactor_Backfill(t *testing.T) { @@ -494,7 +621,6 @@ func retryUntil(t *testing.T, fn func() bool, timeout time.Duration) { if fn() { return } - require.NoError(t, ctx.Err()) } } @@ -523,7 +649,9 @@ func handleLightBlockRequests(t *testing.T, } else { switch errorCount % 3 { case 0: // send a different block - differntLB, err := mockLB(t, int64(msg.Height), factory.DefaultTestTime, factory.MakeBlockID()).ToProto() + vals, pv := factory.RandValidatorSet(3, 10) + _, _, lb := mockLB(t, int64(msg.Height), factory.DefaultTestTime, factory.MakeBlockID(), vals, pv) + differntLB, err := lb.ToProto() require.NoError(t, err) sending <- p2p.Envelope{ From: envelope.To, @@ -550,37 +678,147 @@ func handleLightBlockRequests(t *testing.T, } } +func handleConsensusParamsRequest(t *testing.T, receiving, sending chan p2p.Envelope, closeCh chan struct{}) { + t.Helper() + params := types.DefaultConsensusParams() + paramsProto := params.ToProto() + for { + select { + case envelope := <-receiving: + t.Log("received consensus params request") + msg, ok := envelope.Message.(*ssproto.ParamsRequest) + require.True(t, ok) + sending <- p2p.Envelope{ + From: envelope.To, + Message: &ssproto.ParamsResponse{ + Height: msg.Height, + ConsensusParams: paramsProto, + }, + } + + case <-closeCh: + return + } + } +} + func buildLightBlockChain(t *testing.T, fromHeight, toHeight int64, startTime time.Time) map[int64]*types.LightBlock { chain := make(map[int64]*types.LightBlock, toHeight-fromHeight) lastBlockID := factory.MakeBlockID() - blockTime := startTime.Add(-5 * time.Minute) + blockTime := startTime.Add(time.Duration(fromHeight-toHeight) * time.Minute) + vals, pv := factory.RandValidatorSet(3, 10) for height := fromHeight; height < toHeight; height++ { - chain[height] = mockLB(t, height, blockTime, lastBlockID) + vals, pv, chain[height] = mockLB(t, height, blockTime, lastBlockID, vals, pv) lastBlockID = factory.MakeBlockIDWithHash(chain[height].Header.Hash()) blockTime = blockTime.Add(1 * time.Minute) } return chain } -func mockLB(t *testing.T, height int64, time time.Time, - lastBlockID types.BlockID) *types.LightBlock { +func mockLB(t *testing.T, height int64, time time.Time, lastBlockID types.BlockID, + currentVals *types.ValidatorSet, currentPrivVals []types.PrivValidator, +) (*types.ValidatorSet, []types.PrivValidator, *types.LightBlock) { header, err := factory.MakeHeader(&types.Header{ Height: height, LastBlockID: lastBlockID, Time: time, }) require.NoError(t, err) - vals, pv := factory.RandValidatorSet(3, 10) - header.ValidatorsHash = vals.Hash() + nextVals, nextPrivVals := factory.RandValidatorSet(3, 10) + header.ValidatorsHash = currentVals.Hash() + 
header.NextValidatorsHash = nextVals.Hash() + header.ConsensusHash = types.DefaultConsensusParams().HashConsensusParams() lastBlockID = factory.MakeBlockIDWithHash(header.Hash()) - voteSet := types.NewVoteSet(factory.DefaultTestChainID, height, 0, tmproto.PrecommitType, vals) - commit, err := factory.MakeCommit(lastBlockID, height, 0, voteSet, pv, time) + voteSet := types.NewVoteSet(factory.DefaultTestChainID, height, 0, tmproto.PrecommitType, currentVals) + commit, err := factory.MakeCommit(lastBlockID, height, 0, voteSet, currentPrivVals, time) require.NoError(t, err) - return &types.LightBlock{ + return nextVals, nextPrivVals, &types.LightBlock{ SignedHeader: &types.SignedHeader{ Header: header, Commit: commit, }, - ValidatorSet: vals, + ValidatorSet: currentVals, + } +} + +// graduallyAddPeers delivers a new randomly-generated peer update on peerUpdateCh once +// per interval, until closeCh is closed. Each peer update is assigned a random node ID. +func graduallyAddPeers( + peerUpdateCh chan p2p.PeerUpdate, + closeCh chan struct{}, + interval time.Duration, +) { + ticker := time.NewTicker(interval) + for { + select { + case <-ticker.C: + peerUpdateCh <- p2p.PeerUpdate{ + NodeID: factory.RandomNodeID(), + Status: p2p.PeerStatusUp, + } + case <-closeCh: + return + } + } +} + +func handleSnapshotRequests( + t *testing.T, + receivingCh chan p2p.Envelope, + sendingCh chan p2p.Envelope, + closeCh chan struct{}, + snapshots []snapshot, +) { + t.Helper() + for { + select { + case envelope := <-receivingCh: + _, ok := envelope.Message.(*ssproto.SnapshotsRequest) + require.True(t, ok) + for _, snapshot := range snapshots { + sendingCh <- p2p.Envelope{ + From: envelope.To, + Message: &ssproto.SnapshotsResponse{ + Height: snapshot.Height, + Format: snapshot.Format, + Chunks: snapshot.Chunks, + Hash: snapshot.Hash, + Metadata: snapshot.Metadata, + }, + } + } + case <-closeCh: + return + } + } +} + +func handleChunkRequests( + t *testing.T, + receivingCh chan p2p.Envelope, + sendingCh chan p2p.Envelope, + closeCh chan struct{}, + chunk []byte, +) { + t.Helper() + for { + select { + case envelope := <-receivingCh: + msg, ok := envelope.Message.(*ssproto.ChunkRequest) + require.True(t, ok) + sendingCh <- p2p.Envelope{ + From: envelope.To, + Message: &ssproto.ChunkResponse{ + Height: msg.Height, + Format: msg.Format, + Index: msg.Index, + Chunk: chunk, + Missing: false, + }, + } + + case <-closeCh: + return + } } } diff --git a/internal/statesync/snapshots.go b/internal/statesync/snapshots.go index 9058304a9..a0620e450 100644 --- a/internal/statesync/snapshots.go +++ b/internal/statesync/snapshots.go @@ -1,13 +1,11 @@ package statesync import ( - "context" "crypto/sha256" "fmt" "math/rand" "sort" "strings" - "time" tmsync "github.com/tendermint/tendermint/internal/libs/sync" "github.com/tendermint/tendermint/types" @@ -43,8 +41,6 @@ func (s *snapshot) Key() snapshotKey { // snapshotPool discovers and aggregates snapshots across peers. type snapshotPool struct { - stateProvider StateProvider - tmsync.Mutex snapshots map[snapshotKey]*snapshot snapshotPeers map[snapshotKey]map[types.NodeID]types.NodeID @@ -60,10 +56,9 @@ type snapshotPool struct { snapshotBlacklist map[snapshotKey]bool } -// newSnapshotPool creates a new snapshot pool. The state source is used for -func newSnapshotPool(stateProvider StateProvider) *snapshotPool { +// newSnapshotPool creates a new empty snapshot pool. 
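+// +// Usage sketch (illustrative): the reactor adds snapshots as peers advertise them and asks the pool for the best remaining candidate: +// +// pool := newSnapshotPool() +// added, err := pool.Add(peerID, snap) +// best := pool.Best()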
+func newSnapshotPool() *snapshotPool { return &snapshotPool{ - stateProvider: stateProvider, snapshots: make(map[snapshotKey]*snapshot), snapshotPeers: make(map[snapshotKey]map[types.NodeID]types.NodeID), formatIndex: make(map[uint32]map[snapshotKey]bool), @@ -80,14 +75,6 @@ func newSnapshotPool(stateProvider StateProvider) *snapshotPool { // snapshot height is verified using the light client, and the expected app hash // is set for the snapshot. func (p *snapshotPool) Add(peerID types.NodeID, snapshot *snapshot) (bool, error) { - ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second) - defer cancel() - - appHash, err := p.stateProvider.AppHash(ctx, snapshot.Height) - if err != nil { - return false, fmt.Errorf("failed to get app hash: %w", err) - } - snapshot.trustedAppHash = appHash key := snapshot.Key() p.Lock() diff --git a/internal/statesync/snapshots_test.go b/internal/statesync/snapshots_test.go index 6f27269f7..08cb08269 100644 --- a/internal/statesync/snapshots_test.go +++ b/internal/statesync/snapshots_test.go @@ -3,10 +3,8 @@ package statesync import ( "testing" - "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" - "github.com/tendermint/tendermint/internal/statesync/mocks" "github.com/tendermint/tendermint/types" ) @@ -39,13 +37,10 @@ func TestSnapshot_Key(t *testing.T) { } func TestSnapshotPool_Add(t *testing.T) { - stateProvider := &mocks.StateProvider{} - stateProvider.On("AppHash", mock.Anything, uint64(1)).Return([]byte("app_hash"), nil) - peerID := types.NodeID("aa") // Adding to the pool should work - pool := newSnapshotPool(stateProvider) + pool := newSnapshotPool() added, err := pool.Add(peerID, &snapshot{ Height: 1, Format: 1, @@ -66,18 +61,12 @@ func TestSnapshotPool_Add(t *testing.T) { require.NoError(t, err) require.False(t, added) - // The pool should have populated the snapshot with the trusted app hash snapshot := pool.Best() require.NotNil(t, snapshot) - require.Equal(t, []byte("app_hash"), snapshot.trustedAppHash) - - stateProvider.AssertExpectations(t) } func TestSnapshotPool_GetPeer(t *testing.T) { - stateProvider := &mocks.StateProvider{} - stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil) - pool := newSnapshotPool(stateProvider) + pool := newSnapshotPool() s := &snapshot{Height: 1, Format: 1, Chunks: 1, Hash: []byte{1}} @@ -112,9 +101,7 @@ func TestSnapshotPool_GetPeer(t *testing.T) { } func TestSnapshotPool_GetPeers(t *testing.T) { - stateProvider := &mocks.StateProvider{} - stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil) - pool := newSnapshotPool(stateProvider) + pool := newSnapshotPool() s := &snapshot{Height: 1, Format: 1, Chunks: 1, Hash: []byte{1}} @@ -137,9 +124,7 @@ func TestSnapshotPool_GetPeers(t *testing.T) { } func TestSnapshotPool_Ranked_Best(t *testing.T) { - stateProvider := &mocks.StateProvider{} - stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil) - pool := newSnapshotPool(stateProvider) + pool := newSnapshotPool() // snapshots in expected order (best to worst). Highest height wins, then highest format. 
// Snapshots with different chunk hashes are considered different, and the most peers is @@ -182,9 +167,7 @@ func TestSnapshotPool_Ranked_Best(t *testing.T) { } func TestSnapshotPool_Reject(t *testing.T) { - stateProvider := &mocks.StateProvider{} - stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil) - pool := newSnapshotPool(stateProvider) + pool := newSnapshotPool() peerID := types.NodeID("aa") @@ -212,9 +195,7 @@ func TestSnapshotPool_Reject(t *testing.T) { } func TestSnapshotPool_RejectFormat(t *testing.T) { - stateProvider := &mocks.StateProvider{} - stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil) - pool := newSnapshotPool(stateProvider) + pool := newSnapshotPool() peerID := types.NodeID("aa") @@ -243,9 +224,7 @@ func TestSnapshotPool_RejectFormat(t *testing.T) { } func TestSnapshotPool_RejectPeer(t *testing.T) { - stateProvider := &mocks.StateProvider{} - stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil) - pool := newSnapshotPool(stateProvider) + pool := newSnapshotPool() peerAID := types.NodeID("aa") peerBID := types.NodeID("bb") @@ -285,9 +264,7 @@ func TestSnapshotPool_RejectPeer(t *testing.T) { } func TestSnapshotPool_RemovePeer(t *testing.T) { - stateProvider := &mocks.StateProvider{} - stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil) - pool := newSnapshotPool(stateProvider) + pool := newSnapshotPool() peerAID := types.NodeID("aa") peerBID := types.NodeID("bb") diff --git a/internal/statesync/stateprovider.go b/internal/statesync/stateprovider.go index fd889dc51..b58cb35de 100644 --- a/internal/statesync/stateprovider.go +++ b/internal/statesync/stateprovider.go @@ -1,7 +1,9 @@ package statesync import ( + "bytes" "context" + "errors" "fmt" "strings" "time" @@ -9,21 +11,25 @@ import ( dbm "github.com/tendermint/tm-db" tmsync "github.com/tendermint/tendermint/internal/libs/sync" + "github.com/tendermint/tendermint/internal/p2p" "github.com/tendermint/tendermint/libs/log" "github.com/tendermint/tendermint/light" lightprovider "github.com/tendermint/tendermint/light/provider" lighthttp "github.com/tendermint/tendermint/light/provider/http" lightrpc "github.com/tendermint/tendermint/light/rpc" lightdb "github.com/tendermint/tendermint/light/store/db" + ssproto "github.com/tendermint/tendermint/proto/tendermint/statesync" rpchttp "github.com/tendermint/tendermint/rpc/client/http" sm "github.com/tendermint/tendermint/state" "github.com/tendermint/tendermint/types" + "github.com/tendermint/tendermint/version" ) //go:generate ../../scripts/mockery_generate.sh StateProvider // StateProvider is a provider of trusted state data for bootstrapping a node. This refers -// to the state.State object, not the state machine. +// to the state.State object, not the state machine. There are two implementations. One +// uses the P2P layer and the other uses the RPC layer. Both use light client verification. type StateProvider interface { // AppHash returns the app hash after the given height has been committed. AppHash(ctx context.Context, height uint64) ([]byte, error) @@ -33,20 +39,17 @@ type StateProvider interface { State(ctx context.Context, height uint64) (sm.State, error) } -// lightClientStateProvider is a state provider using the light client. 
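The interface above is everything the syncer relies on. To make the division of labor concrete, here is a hedged sketch of how a consumer of `StateProvider` might bootstrap state for a chosen snapshot height; `bootstrapState` is illustrative and not part of this diff:

```go
// bootstrapState shows, schematically, how a state-sync consumer can use a
// StateProvider once a snapshot height has been chosen. Hedged sketch only;
// the syncer's real control flow differs.
func bootstrapState(ctx context.Context, sp StateProvider, height uint64) (sm.State, *types.Commit, error) {
	// App hash to check the restored snapshot against.
	appHash, err := sp.AppHash(ctx, height)
	if err != nil {
		return sm.State{}, nil, err
	}
	state, err := sp.State(ctx, height)
	if err != nil {
		return sm.State{}, nil, err
	}
	if !bytes.Equal(state.AppHash, appHash) {
		return sm.State{}, nil, fmt.Errorf("app hash mismatch at height %d", height)
	}
	// Commit for the last block, needed before consensus can start.
	commit, err := sp.Commit(ctx, height)
	return state, commit, err
}
```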
-type lightClientStateProvider struct { +type stateProviderRPC struct { tmsync.Mutex // light.Client is not concurrency-safe lc *light.Client - version sm.Version initialHeight int64 providers map[lightprovider.Provider]string } -// NewLightClientStateProvider creates a new StateProvider using a light client and RPC clients. -func NewLightClientStateProvider( +// NewRPCStateProvider creates a new StateProvider using a light client and RPC clients. +func NewRPCStateProvider( ctx context.Context, chainID string, - version sm.Version, initialHeight int64, servers []string, trustOptions light.TrustOptions, @@ -75,51 +78,17 @@ func NewLightClientStateProvider( if err != nil { return nil, err } - return &lightClientStateProvider{ + return &stateProviderRPC{ lc: lc, - version: version, initialHeight: initialHeight, providers: providerRemotes, }, nil } -// NewLightClientStateProviderFromDispatcher creates a light client state -// provider but uses a p2p connected dispatched instead of RPC endpoints -func NewLightClientStateProviderFromDispatcher( - ctx context.Context, - chainID string, - version sm.Version, - initialHeight int64, - dispatcher *dispatcher, - trustOptions light.TrustOptions, - logger log.Logger, -) (StateProvider, error) { - providers := dispatcher.Providers(chainID, 30*time.Second) - if len(providers) < 2 { - return nil, fmt.Errorf("at least 2 peers are required, got %d", len(providers)) - } - - providersMap := make(map[lightprovider.Provider]string) - for _, p := range providers { - providersMap[p] = p.(*blockProvider).String() - } - - lc, err := light.NewClient(ctx, chainID, trustOptions, providers[0], providers[1:], - lightdb.New(dbm.NewMemDB()), light.Logger(logger)) - if err != nil { - return nil, err - } - - return &lightClientStateProvider{ - lc: lc, - version: version, - initialHeight: initialHeight, - providers: providersMap, - }, nil -} - -// AppHash implements StateProvider. -func (s *lightClientStateProvider) AppHash(ctx context.Context, height uint64) ([]byte, error) { +// AppHash implements part of StateProvider. It uses the light client to verify the +// light blocks at heights h+1 and h+2 and, if verification succeeds, reports the app +// hash for the block at height h+1, which corresponds to the state at height h. +func (s *stateProviderRPC) AppHash(ctx context.Context, height uint64) ([]byte, error) { s.Lock() defer s.Unlock() @@ -128,27 +97,19 @@ func (s *lightClientStateProvider) AppHash(ctx context.Context, height uint64) ( if err != nil { return nil, err } - // We also try to fetch the blocks at height H and H+2, since we need these + + // We also try to fetch the blocks at H+2, since we need these // when building the state while restoring the snapshot. This avoids the race // condition where we try to restore a snapshot before H+2 exists. - // - // FIXME This is a hack, since we can't add new methods to the interface without - // breaking it. We should instead have a Has(ctx, height) method which checks - // that the state provider has access to the necessary data for the height. - // We piggyback on AppHash() since it's called when adding snapshots to the pool. _, err = s.lc.VerifyLightBlockAtHeight(ctx, int64(height+2), time.Now()) if err != nil { return nil, err } - _, err = s.lc.VerifyLightBlockAtHeight(ctx, int64(height), time.Now()) - if err != nil { - return nil, err - } return header.AppHash, nil } // Commit implements StateProvider. 
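The off-by-one in `AppHash` above trips people up, so here is a worked example of the height mapping with illustrative numbers only (not code from the diff):

```go
// For a snapshot taken at height 100:
const (
	snapshotHeight = 100 // state after executing block 100
	appHashHeader  = 101 // header at h+1 carries the AppHash of state 100
	lookaheadBlock = 102 // h+2 must also exist before restoration completes
)
// So AppHash(ctx, 100) verifies light blocks 101 and 102 and returns
// header(101).AppHash.
```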
-func (s *lightClientStateProvider) Commit(ctx context.Context, height uint64) (*types.Commit, error) { +func (s *stateProviderRPC) Commit(ctx context.Context, height uint64) (*types.Commit, error) { s.Lock() defer s.Unlock() header, err := s.lc.VerifyLightBlockAtHeight(ctx, int64(height), time.Now()) @@ -159,13 +120,12 @@ func (s *lightClientStateProvider) Commit(ctx context.Context, height uint64) (* } // State implements StateProvider. -func (s *lightClientStateProvider) State(ctx context.Context, height uint64) (sm.State, error) { +func (s *stateProviderRPC) State(ctx context.Context, height uint64) (sm.State, error) { s.Lock() defer s.Unlock() state := sm.State{ ChainID: s.lc.ChainID(), - Version: s.version, InitialHeight: s.initialHeight, } if state.InitialHeight == 0 { @@ -193,6 +153,10 @@ func (s *lightClientStateProvider) State(ctx context.Context, height uint64) (sm return sm.State{}, err } + state.Version = sm.Version{ + Consensus: currentLightBlock.Version, + Software: version.TMVersion, + } state.LastBlockHeight = lastLightBlock.Height state.LastBlockTime = lastLightBlock.Time state.LastBlockID = lastLightBlock.Commit.BlockID @@ -229,9 +193,188 @@ func rpcClient(server string) (*rpchttp.HTTP, error) { if !strings.Contains(server, "://") { server = "http://" + server } - c, err := rpchttp.New(server) + return rpchttp.New(server) +} + +type stateProviderP2P struct { + tmsync.Mutex // light.Client is not concurrency-safe + lc *light.Client + initialHeight int64 + paramsSendCh chan<- p2p.Envelope + paramsRecvCh chan types.ConsensusParams +} + +// NewP2PStateProvider creates a light client state +// provider but uses a dispatcher connected to the P2P layer +func NewP2PStateProvider( + ctx context.Context, + chainID string, + initialHeight int64, + providers []lightprovider.Provider, + trustOptions light.TrustOptions, + paramsSendCh chan<- p2p.Envelope, + logger log.Logger, +) (StateProvider, error) { + if len(providers) < 2 { + return nil, fmt.Errorf("at least 2 peers are required, got %d", len(providers)) + } + + lc, err := light.NewClient(ctx, chainID, trustOptions, providers[0], providers[1:], + lightdb.New(dbm.NewMemDB()), light.Logger(logger)) if err != nil { return nil, err } - return c, nil + + return &stateProviderP2P{ + lc: lc, + initialHeight: initialHeight, + paramsSendCh: paramsSendCh, + paramsRecvCh: make(chan types.ConsensusParams), + }, nil +} + +// AppHash implements StateProvider. +func (s *stateProviderP2P) AppHash(ctx context.Context, height uint64) ([]byte, error) { + s.Lock() + defer s.Unlock() + + // We have to fetch the next height, which contains the app hash for the previous height. + header, err := s.lc.VerifyLightBlockAtHeight(ctx, int64(height+1), time.Now()) + if err != nil { + return nil, err + } + + // We also try to fetch the blocks at H+2, since we need these + // when building the state while restoring the snapshot. This avoids the race + // condition where we try to restore a snapshot before H+2 exists. + _, err = s.lc.VerifyLightBlockAtHeight(ctx, int64(height+2), time.Now()) + if err != nil { + return nil, err + } + return header.AppHash, nil +} + +// Commit implements StateProvider. +func (s *stateProviderP2P) Commit(ctx context.Context, height uint64) (*types.Commit, error) { + s.Lock() + defer s.Unlock() + header, err := s.lc.VerifyLightBlockAtHeight(ctx, int64(height), time.Now()) + if err != nil { + return nil, err + } + return header.Commit, nil +} + +// State implements StateProvider. 
+func (s *stateProviderP2P) State(ctx context.Context, height uint64) (sm.State, error) { + s.Lock() + defer s.Unlock() + + state := sm.State{ + ChainID: s.lc.ChainID(), + InitialHeight: s.initialHeight, + } + if state.InitialHeight == 0 { + state.InitialHeight = 1 + } + + // The snapshot height maps onto the state heights as follows: + // + // height: last block, i.e. the snapshotted height + // height+1: current block, i.e. the first block we'll process after the snapshot + // height+2: next block, i.e. the second block after the snapshot + // + // We need to fetch the NextValidators from height+2 because if the application changed + // the validator set at the snapshot height then this only takes effect at height+2. + lastLightBlock, err := s.lc.VerifyLightBlockAtHeight(ctx, int64(height), time.Now()) + if err != nil { + return sm.State{}, err + } + currentLightBlock, err := s.lc.VerifyLightBlockAtHeight(ctx, int64(height+1), time.Now()) + if err != nil { + return sm.State{}, err + } + nextLightBlock, err := s.lc.VerifyLightBlockAtHeight(ctx, int64(height+2), time.Now()) + if err != nil { + return sm.State{}, err + } + + state.Version = sm.Version{ + Consensus: currentLightBlock.Version, + Software: version.TMVersion, + } + state.LastBlockHeight = lastLightBlock.Height + state.LastBlockTime = lastLightBlock.Time + state.LastBlockID = lastLightBlock.Commit.BlockID + state.AppHash = currentLightBlock.AppHash + state.LastResultsHash = currentLightBlock.LastResultsHash + state.LastValidators = lastLightBlock.ValidatorSet + state.Validators = currentLightBlock.ValidatorSet + state.NextValidators = nextLightBlock.ValidatorSet + state.LastHeightValidatorsChanged = nextLightBlock.Height + + // We'll also need to fetch consensus params via P2P. + state.ConsensusParams, err = s.consensusParams(ctx, currentLightBlock.Height) + if err != nil { + return sm.State{}, err + } + // validate the consensus params + if !bytes.Equal(nextLightBlock.ConsensusHash, state.ConsensusParams.HashConsensusParams()) { + return sm.State{}, fmt.Errorf("consensus params hash mismatch at height %d. Expected %v, got %v", + currentLightBlock.Height, nextLightBlock.ConsensusHash, state.ConsensusParams.HashConsensusParams()) + } + // set the last height changed to the current height + state.LastHeightConsensusParamsChanged = currentLightBlock.Height + + return state, nil +} + +// addProvider dynamically adds a peer as a new witness. A limit of 6 providers is kept as a +// heuristic. Too many overburden the network and too few compromise the second layer of security. +func (s *stateProviderP2P) addProvider(p lightprovider.Provider) { + if len(s.lc.Witnesses()) < 6 { + s.lc.AddProvider(p) + } +} + +// consensusParams sends out a request for consensus params blocking until one is returned. +// If it fails to get a valid set of consensus params from any of the providers it returns an error. 
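The requesting side of that round trip follows next in the diff. For the peer that receives a `ParamsRequest`, the handler presumably looks something like this hedged sketch (the real handler lives in the reactor and is not part of this hunk; `handleParamsRequest` is an illustrative name):

```go
// handleParamsRequest sketches the peer-side counterpart of consensusParams:
// load the consensus params at the requested height and echo them back on the
// params channel. Hedged sketch; error handling and channel wiring simplified.
func handleParamsRequest(stateStore sm.Store, req *ssproto.ParamsRequest, from types.NodeID, out chan<- p2p.Envelope) error {
	cp, err := stateStore.LoadConsensusParams(int64(req.Height))
	if err != nil {
		return err
	}
	out <- p2p.Envelope{
		To: from,
		Message: &ssproto.ParamsResponse{
			Height:          req.Height,
			ConsensusParams: cp.ToProto(),
		},
	}
	return nil
}
```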
+func (s *stateProviderP2P) consensusParams(ctx context.Context, height int64) (types.ConsensusParams, error) { + for _, provider := range s.lc.Witnesses() { + p, ok := provider.(*BlockProvider) + if !ok { + panic("expected p2p state provider to use p2p block providers") + } + + // extract the nodeID of the provider + peer, err := types.NewNodeID(p.String()) + if err != nil { + return types.ConsensusParams{}, fmt.Errorf("invalid provider (%s) node id: %w", p.String(), err) + } + + select { + case s.paramsSendCh <- p2p.Envelope{ + To: peer, + Message: &ssproto.ParamsRequest{ + Height: uint64(height), + }, + }: + case <-ctx.Done(): + return types.ConsensusParams{}, ctx.Err() + } + + select { + // if we get no response from this provider we move on to the next one + case <-time.After(consensusParamsResponseTimeout): + continue + case <-ctx.Done(): + return types.ConsensusParams{}, ctx.Err() + case params, ok := <-s.paramsRecvCh: + if !ok { + return types.ConsensusParams{}, errors.New("params channel closed") + } + return params, nil + } + } + return types.ConsensusParams{}, errors.New("unable to fetch consensus params from connected providers") } diff --git a/internal/statesync/syncer.go b/internal/statesync/syncer.go index 5dc8aeb8c..559e98a8f 100644 --- a/internal/statesync/syncer.go +++ b/internal/statesync/syncer.go @@ -12,6 +12,7 @@ import ( tmsync "github.com/tendermint/tendermint/internal/libs/sync" "github.com/tendermint/tendermint/internal/p2p" "github.com/tendermint/tendermint/libs/log" + "github.com/tendermint/tendermint/light" ssproto "github.com/tendermint/tendermint/proto/tendermint/statesync" "github.com/tendermint/tendermint/proxy" sm "github.com/tendermint/tendermint/state" @@ -40,14 +41,11 @@ var ( errRejectSender = errors.New("snapshot sender was rejected") // errVerifyFailed is returned by Sync() when app hash or last height // verification fails. - errVerifyFailed = errors.New("verification failed") + errVerifyFailed = errors.New("verification with app failed") // errTimeout is returned by Sync() when we've waited too long to receive a chunk. errTimeout = errors.New("timed out waiting for chunk") // errNoSnapshots is returned by SyncAny() if no snapshots are found and discovery is disabled. errNoSnapshots = errors.New("no suitable snapshots found") - // errStateCommitTimeout is returned by Sync() when the timeout for retrieving - // tendermint state or the commit is exceeded - errStateCommitTimeout = errors.New("timed out trying to retrieve state and commit") ) // syncer runs a state sync against an ABCI app. 
Use either SyncAny() to automatically attempt to @@ -84,7 +82,7 @@ func newSyncer( stateProvider: stateProvider, conn: conn, connQuery: connQuery, - snapshots: newSnapshotPool(stateProvider), + snapshots: newSnapshotPool(), snapshotCh: snapshotCh, chunkCh: chunkCh, tempDir: tempDir, @@ -153,7 +151,6 @@ func (s *syncer) SyncAny( discoveryTime time.Duration, requestSnapshots func(), ) (sm.State, *types.Commit, error) { - if discoveryTime != 0 && discoveryTime < minimumDiscoveryTime { discoveryTime = minimumDiscoveryTime } @@ -181,7 +178,6 @@ func (s *syncer) SyncAny( if discoveryTime == 0 { return sm.State{}, nil, errNoSnapshots } - requestSnapshots() s.logger.Info(fmt.Sprintf("Discovering snapshots for %v", discoveryTime)) time.Sleep(discoveryTime) continue @@ -230,10 +226,6 @@ func (s *syncer) SyncAny( s.logger.Info("Snapshot sender rejected", "peer", peer) } - case errors.Is(err, errStateCommitTimeout): - s.logger.Info("Timed out retrieving state and commit, rejecting and retrying...", "height", snapshot.Height) - s.snapshots.Reject(snapshot) - default: return sm.State{}, nil, fmt.Errorf("snapshot restoration failed: %w", err) } @@ -264,8 +256,29 @@ func (s *syncer) Sync(ctx context.Context, snapshot *snapshot, chunks *chunkQueu s.mtx.Unlock() }() + hctx, hcancel := context.WithTimeout(ctx, 30*time.Second) + defer hcancel() + + // Fetch the app hash corresponding to the snapshot + appHash, err := s.stateProvider.AppHash(hctx, snapshot.Height) + if err != nil { + // check if the main context was triggered + if ctx.Err() != nil { + return sm.State{}, nil, ctx.Err() + } + // catch the case where all the light client providers have been exhausted + if err == light.ErrNoWitnesses { + return sm.State{}, nil, + fmt.Errorf("failed to get app hash at height %d. No witnesses remaining", snapshot.Height) + } + s.logger.Info("failed to get and verify tendermint state. Dropping snapshot and trying again", + "err", err, "height", snapshot.Height) + return sm.State{}, nil, errRejectSnapshot + } + snapshot.trustedAppHash = appHash + // Offer snapshot to ABCI app. - err := s.offerSnapshot(ctx, snapshot) + err = s.offerSnapshot(ctx, snapshot) if err != nil { return sm.State{}, nil, err } @@ -277,27 +290,37 @@ func (s *syncer) Sync(ctx context.Context, snapshot *snapshot, chunks *chunkQueu go s.fetchChunks(fetchCtx, snapshot, chunks) } - pctx, pcancel := context.WithTimeout(ctx, 30*time.Second) + pctx, pcancel := context.WithTimeout(ctx, 1*time.Minute) defer pcancel() // Optimistically build new state, so we don't discover any light client failures at the end. state, err := s.stateProvider.State(pctx, snapshot.Height) if err != nil { - // check if the provider context exceeded the 10 second deadline - if err == context.DeadlineExceeded && ctx.Err() == nil { - return sm.State{}, nil, errStateCommitTimeout + // check if the main context was triggered + if ctx.Err() != nil { + return sm.State{}, nil, ctx.Err() } - - return sm.State{}, nil, fmt.Errorf("failed to build new state: %w", err) + if err == light.ErrNoWitnesses { + return sm.State{}, nil, + fmt.Errorf("failed to get tendermint state at height %d. No witnesses remaining", snapshot.Height) + } + s.logger.Info("failed to get and verify tendermint state. 
Dropping snapshot and trying again", + "err", err, "height", snapshot.Height) + return sm.State{}, nil, errRejectSnapshot } commit, err := s.stateProvider.Commit(pctx, snapshot.Height) if err != nil { // check if the provider context exceeded the 10 second deadline - if err == context.DeadlineExceeded && ctx.Err() == nil { - return sm.State{}, nil, errStateCommitTimeout + if ctx.Err() != nil { + return sm.State{}, nil, ctx.Err() } - - return sm.State{}, nil, fmt.Errorf("failed to fetch commit: %w", err) + if err == light.ErrNoWitnesses { + return sm.State{}, nil, + fmt.Errorf("failed to get commit at height %d. No witnesses remaining", snapshot.Height) + } + s.logger.Info("failed to get and verify commit. Dropping snapshot and trying again", + "err", err, "height", snapshot.Height) + return sm.State{}, nil, errRejectSnapshot } // Restore snapshot diff --git a/internal/test/factory/doc.go b/internal/test/factory/doc.go new file mode 100644 index 000000000..5b6b313f6 --- /dev/null +++ b/internal/test/factory/doc.go @@ -0,0 +1,6 @@ +/* +Package factory provides generation code for common structs in Tendermint. +It is used primarily for the testing of internal components such as statesync, +consensus, blocksync, etc. +*/ +package factory diff --git a/internal/test/factory/factory_test.go b/internal/test/factory/factory_test.go index 25f234508..07a3ef8b3 100644 --- a/internal/test/factory/factory_test.go +++ b/internal/test/factory/factory_test.go @@ -12,3 +12,7 @@ func TestMakeHeader(t *testing.T) { _, err := MakeHeader(&types.Header{}) assert.NoError(t, err) } + +func TestRandomNodeID(t *testing.T) { + assert.NotPanics(t, func() { RandomNodeID() }) +} diff --git a/internal/test/factory/p2p.go b/internal/test/factory/p2p.go new file mode 100644 index 000000000..34c139f58 --- /dev/null +++ b/internal/test/factory/p2p.go @@ -0,0 +1,27 @@ +package factory + +import ( + "encoding/hex" + "strings" + + "github.com/tendermint/tendermint/libs/rand" + "github.com/tendermint/tendermint/types" +) + +// NodeID returns a valid NodeID based on an input string +func NodeID(str string) types.NodeID { + id, err := types.NewNodeID(strings.Repeat(str, 2*types.NodeIDByteLength)) + if err != nil { + panic(err) + } + return id +} + +// RandomNodeID returns a randomly generated valid NodeID +func RandomNodeID() types.NodeID { + id, err := types.NewNodeID(hex.EncodeToString(rand.Bytes(types.NodeIDByteLength))) + if err != nil { + panic(err) + } + return id +} diff --git a/light/client.go b/light/client.go index 52bbdf981..cc606f496 100644 --- a/light/client.go +++ b/light/client.go @@ -52,6 +52,8 @@ const ( // 10s is sufficient for most networks. defaultMaxBlockLag = 10 * time.Second + + defaultProviderTimeout = 10 * time.Second ) // Option sets a parameter for the light client. @@ -61,9 +63,7 @@ type Option func(*Client) // check the blocks (every block, in ascending height order). Note this is // much slower than SkippingVerification, albeit more secure. func SequentialVerification() Option { - return func(c *Client) { - c.verificationMode = sequential - } + return func(c *Client) { c.verificationMode = sequential } } // SkippingVerification option configures the light client to skip blocks as @@ -87,24 +87,18 @@ func SkippingVerification(trustLevel tmmath.Fraction) Option { // the h amount of light blocks will be removed from the store. // Default: 1000. A pruning size of 0 will not prune the light client at all. 
func PruningSize(h uint16) Option { - return func(c *Client) { - c.pruningSize = h - } + return func(c *Client) { c.pruningSize = h } } // Logger option can be used to set a logger for the client. func Logger(l log.Logger) Option { - return func(c *Client) { - c.logger = l - } + return func(c *Client) { c.logger = l } } // MaxClockDrift defines how much new header's time can drift into // the future relative to the light clients local time. Default: 10s. func MaxClockDrift(d time.Duration) Option { - return func(c *Client) { - c.maxClockDrift = d - } + return func(c *Client) { c.maxClockDrift = d } } // MaxBlockLag represents the maximum time difference between the realtime @@ -116,9 +110,13 @@ func MaxClockDrift(d time.Duration) Option { // was 12:00. Then the lag here is 5 minutes. // Default: 10s func MaxBlockLag(d time.Duration) Option { - return func(c *Client) { - c.maxBlockLag = d - } + return func(c *Client) { c.maxBlockLag = d } +} + +// Provider timeout is the maximum time that the light client will wait for a +// provider to respond with a light block. +func ProviderTimeout(d time.Duration) Option { + return func(c *Client) { c.providerTimeout = d } } // Client represents a light client, connected to a single chain, which gets @@ -133,6 +131,7 @@ type Client struct { trustLevel tmmath.Fraction maxClockDrift time.Duration maxBlockLag time.Duration + providerTimeout time.Duration // Mutex for locking during changes of the light clients providers providerMutex tmsync.Mutex @@ -197,12 +196,13 @@ func NewClient( chainID: chainID, trustingPeriod: trustOptions.Period, verificationMode: skipping, - trustLevel: DefaultTrustLevel, - maxClockDrift: defaultMaxClockDrift, - maxBlockLag: defaultMaxBlockLag, primary: primary, witnesses: witnesses, trustedStore: trustedStore, + trustLevel: DefaultTrustLevel, + maxClockDrift: defaultMaxClockDrift, + maxBlockLag: defaultMaxBlockLag, + providerTimeout: defaultProviderTimeout, pruningSize: defaultPruningSize, logger: log.NewNopLogger(), } @@ -379,6 +379,7 @@ func (c *Client) Update(ctx context.Context, now time.Time) (*types.LightBlock, return nil, err } + // If there is a new light block then verify it if latestBlock.Height > lastTrustedHeight { err = c.verifyLightBlock(ctx, latestBlock, now) if err != nil { @@ -388,7 +389,8 @@ func (c *Client) Update(ctx context.Context, now time.Time) (*types.LightBlock, return latestBlock, nil } - return nil, nil + // else return the latestTrustedBlock + return c.latestTrustedBlock, nil } // VerifyLightBlockAtHeight fetches the light block at the given height @@ -693,7 +695,9 @@ func (c *Client) verifySkipping( if depth == len(blockCache)-1 { // schedule what the next height we need to fetch is pivotHeight := c.schedule(verifiedBlock.Height, blockCache[depth].Height) - interimBlock, providerErr := source.LightBlock(ctx, pivotHeight) + subCtx, cancel := context.WithTimeout(ctx, c.providerTimeout) + defer cancel() + interimBlock, providerErr := c.getLightBlock(subCtx, source, pivotHeight) if providerErr != nil { return nil, ErrVerificationFailed{From: verifiedBlock.Height, To: pivotHeight, Reason: providerErr} } @@ -930,7 +934,7 @@ func (c *Client) backwards( // any other error, the primary is permanently dropped and is replaced by a witness. 
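Before the primary-fetch logic below, a usage note on the new `ProviderTimeout` option introduced above. This is a hedged sketch built from the same constructor calls the tests use; `primary`, `witnesses`, and `trustedHash` are caller-supplied placeholders and the trust parameters are illustrative:

```go
// newVerifyingClient sketches wiring ProviderTimeout in with existing options.
func newVerifyingClient(
	ctx context.Context,
	chainID string,
	trustedHash []byte,
	primary provider.Provider,
	witnesses []provider.Provider,
) (*light.Client, error) {
	return light.NewClient(
		ctx,
		chainID,
		light.TrustOptions{Period: 2 * time.Hour, Height: 100, Hash: trustedHash},
		primary,
		witnesses,
		dbs.New(dbm.NewMemDB()),
		light.ProviderTimeout(5*time.Second), // cap each individual provider call
		light.Logger(log.TestingLogger()),
	)
}
```

The per-call timeout means one stalled provider can no longer hold up verification for the whole parent context.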
func (c *Client) lightBlockFromPrimary(ctx context.Context, height int64) (*types.LightBlock, error) { c.providerMutex.Lock() - l, err := c.primary.LightBlock(ctx, height) + l, err := c.getLightBlock(ctx, c.primary, height) c.providerMutex.Unlock() switch err { @@ -957,6 +961,16 @@ func (c *Client) lightBlockFromPrimary(ctx context.Context, height int64) (*type } } +func (c *Client) getLightBlock(ctx context.Context, p provider.Provider, height int64) (*types.LightBlock, error) { + subCtx, cancel := context.WithTimeout(ctx, c.providerTimeout) + defer cancel() + l, err := p.LightBlock(subCtx, height) + if err == context.DeadlineExceeded || ctx.Err() != nil { + return nil, provider.ErrNoResponse + } + return l, err +} + // NOTE: requires a providerMutex lock func (c *Client) removeWitnesses(indexes []int) error { // check that we will still have witnesses remaining @@ -989,7 +1003,7 @@ func (c *Client) findNewPrimary(ctx context.Context, height int64, remove bool) c.providerMutex.Lock() defer c.providerMutex.Unlock() - if len(c.witnesses) <= 1 { + if len(c.witnesses) < 1 { return nil, ErrNoWitnesses } @@ -1001,7 +1015,7 @@ func (c *Client) findNewPrimary(ctx context.Context, height int64, remove bool) ) // send out a light block request to all witnesses - subctx, cancel := context.WithCancel(ctx) + subctx, cancel := context.WithTimeout(ctx, c.providerTimeout) defer cancel() for index := range c.witnesses { wg.Add(1) diff --git a/light/client_test.go b/light/client_test.go index e8a478a53..291a3e5b1 100644 --- a/light/client_test.go +++ b/light/client_test.go @@ -644,7 +644,7 @@ func TestClientReplacesPrimaryWithWitnessIfPrimaryIsUnavailable(t *testing.T) { chainID, trustOptions, mockDeadNode, - []provider.Provider{mockFullNode, mockFullNode}, + []provider.Provider{mockDeadNode, mockFullNode}, dbs.New(dbm.NewMemDB()), light.Logger(log.TestingLogger()), ) @@ -663,6 +663,32 @@ func TestClientReplacesPrimaryWithWitnessIfPrimaryIsUnavailable(t *testing.T) { mockFullNode.AssertExpectations(t) } +func TestClientReplacesPrimaryWithWitnessIfPrimaryDoesntHaveBlock(t *testing.T) { + mockFullNode := &provider_mocks.Provider{} + mockFullNode.On("LightBlock", mock.Anything, mock.Anything).Return(l1, nil) + + mockDeadNode := &provider_mocks.Provider{} + mockDeadNode.On("LightBlock", mock.Anything, mock.Anything).Return(nil, provider.ErrLightBlockNotFound) + c, err := light.NewClient( + ctx, + chainID, + trustOptions, + mockDeadNode, + []provider.Provider{mockDeadNode, mockFullNode}, + dbs.New(dbm.NewMemDB()), + light.Logger(log.TestingLogger()), + ) + require.NoError(t, err) + _, err = c.Update(ctx, bTime.Add(2*time.Hour)) + require.NoError(t, err) + + // we should still have the dead node as a witness because it + // hasn't repeatedly been unresponsive yet + assert.Equal(t, 2, len(c.Witnesses())) + mockDeadNode.AssertExpectations(t) + mockFullNode.AssertExpectations(t) +} + func TestClient_BackwardsVerification(t *testing.T) { { headers, vals, _ := genLightBlocksWithKeys(chainID, 9, 3, 0, bTime) @@ -724,51 +750,32 @@ func TestClient_BackwardsVerification(t *testing.T) { } { - testCases := []struct { - headers map[int64]*types.SignedHeader - vals map[int64]*types.ValidatorSet - }{ - { - // 7) provides incorrect height - headers: map[int64]*types.SignedHeader{ - 2: keys.GenSignedHeader(chainID, 1, bTime.Add(30*time.Minute), nil, vals, vals, - hash("app_hash"), hash("cons_hash"), hash("results_hash"), 0, len(keys)), - 3: h3, - }, - vals: valSet, - }, - { - // 8) provides incorrect hash - headers: 
map[int64]*types.SignedHeader{ - 2: keys.GenSignedHeader(chainID, 2, bTime.Add(30*time.Minute), nil, vals, vals, - hash("app_hash2"), hash("cons_hash23"), hash("results_hash30"), 0, len(keys)), - 3: h3, - }, - vals: valSet, - }, + // 8) provides incorrect hash + headers := map[int64]*types.SignedHeader{ + 2: keys.GenSignedHeader(chainID, 2, bTime.Add(30*time.Minute), nil, vals, vals, + hash("app_hash2"), hash("cons_hash23"), hash("results_hash30"), 0, len(keys)), + 3: h3, } + vals := valSet + mockNode := mockNodeFromHeadersAndVals(headers, vals) + c, err := light.NewClient( + ctx, + chainID, + light.TrustOptions{ + Period: 1 * time.Hour, + Height: 3, + Hash: h3.Hash(), + }, + mockNode, + []provider.Provider{mockNode}, + dbs.New(dbm.NewMemDB()), + light.Logger(log.TestingLogger()), + ) + require.NoError(t, err) - for idx, tc := range testCases { - mockNode := mockNodeFromHeadersAndVals(tc.headers, tc.vals) - c, err := light.NewClient( - ctx, - chainID, - light.TrustOptions{ - Period: 1 * time.Hour, - Height: 3, - Hash: h3.Hash(), - }, - mockNode, - []provider.Provider{mockNode}, - dbs.New(dbm.NewMemDB()), - light.Logger(log.TestingLogger()), - ) - require.NoError(t, err, idx) - - _, err = c.VerifyLightBlockAtHeight(ctx, 2, bTime.Add(1*time.Hour).Add(1*time.Second)) - assert.Error(t, err, idx) - mockNode.AssertExpectations(t) - } + _, err = c.VerifyLightBlockAtHeight(ctx, 2, bTime.Add(1*time.Hour).Add(1*time.Second)) + assert.Error(t, err) + mockNode.AssertExpectations(t) } } diff --git a/light/detector.go b/light/detector.go index 32a0c3f1e..ddb0bc4ed 100644 --- a/light/detector.go +++ b/light/detector.go @@ -110,7 +110,7 @@ func (c *Client) detectDivergence(ctx context.Context, primaryTrace []*types.Lig func (c *Client) compareNewHeaderWithWitness(ctx context.Context, errc chan error, h *types.SignedHeader, witness provider.Provider, witnessIndex int) { - lightBlock, err := witness.LightBlock(ctx, h.Height) + lightBlock, err := c.getLightBlock(ctx, witness, h.Height) switch err { // no error means we move on to checking the hash of the two headers case nil: @@ -331,7 +331,7 @@ func (c *Client) examineConflictingHeaderAgainstTrace( if traceBlock.Height == targetBlock.Height { sourceBlock = targetBlock } else { - sourceBlock, err = source.LightBlock(ctx, traceBlock.Height) + sourceBlock, err = c.getLightBlock(ctx, source, traceBlock.Height) if err != nil { return nil, nil, fmt.Errorf("failed to examine trace: %w", err) } @@ -379,7 +379,7 @@ func (c *Client) getTargetBlockOrLatest( height int64, witness provider.Provider, ) (bool, *types.LightBlock, error) { - lightBlock, err := witness.LightBlock(ctx, 0) + lightBlock, err := c.getLightBlock(ctx, witness, 0) if err != nil { return false, nil, err } @@ -394,7 +394,7 @@ func (c *Client) getTargetBlockOrLatest( // the witness has caught up. We recursively call the function again. However in order // to avoid a wild goose chase where the witness sends us one header below and one header // above the height we set a timeout to the context - lightBlock, err := witness.LightBlock(ctx, height) + lightBlock, err := c.getLightBlock(ctx, witness, height) return true, lightBlock, err } diff --git a/light/rpc/client.go b/light/rpc/client.go index 48cf7ce73..84761fb04 100644 --- a/light/rpc/client.go +++ b/light/rpc/client.go @@ -341,7 +341,7 @@ func (c *Client) Block(ctx context.Context, height *int64) (*ctypes.ResultBlock, } // BlockByHash calls rpcclient#BlockByHash and then verifies the result. 
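The signature changes below swap `[]byte` for `tmbytes.HexBytes` in `BlockByHash` and `Tx`. Since `HexBytes` is a named byte slice, existing call sites only need a cheap conversion; the payoff is hex (rather than base64) JSON encoding, matching how hashes appear over RPC. A small self-contained illustration:

```go
package main

import (
	"fmt"

	tmbytes "github.com/tendermint/tendermint/libs/bytes"
)

func main() {
	raw := []byte{0xde, 0xad, 0xbe, 0xef}
	// Named-type conversion: no copy, no allocation.
	hash := tmbytes.HexBytes(raw)
	fmt.Println(hash.String()) // DEADBEEF
}
```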
-func (c *Client) BlockByHash(ctx context.Context, hash []byte) (*ctypes.ResultBlock, error) { +func (c *Client) BlockByHash(ctx context.Context, hash tmbytes.HexBytes) (*ctypes.ResultBlock, error) { res, err := c.next.BlockByHash(ctx, hash) if err != nil { return nil, err @@ -454,7 +454,7 @@ func (c *Client) Commit(ctx context.Context, height *int64) (*ctypes.ResultCommi // Tx calls rpcclient#Tx method and then verifies the proof if such was // requested. -func (c *Client) Tx(ctx context.Context, hash []byte, prove bool) (*ctypes.ResultTx, error) { +func (c *Client) Tx(ctx context.Context, hash tmbytes.HexBytes, prove bool) (*ctypes.ResultTx, error) { res, err := c.next.Tx(ctx, hash, prove) if err != nil || !prove { return res, err diff --git a/networks/remote/ansible/inventory/digital_ocean.py b/networks/remote/ansible/inventory/digital_ocean.py index 24ba64370..383b329a1 100755 --- a/networks/remote/ansible/inventory/digital_ocean.py +++ b/networks/remote/ansible/inventory/digital_ocean.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -''' +""" DigitalOcean external inventory script ====================================== @@ -22,7 +22,7 @@ found. You can force this script to use the cache with --force-cache. ---- Configuration is read from `digital_ocean.ini`, then from environment variables, -then and command-line arguments. +and then from command-line arguments. Most notably, the DigitalOcean API Token must be specified. It can be specified in the INI file or with the following environment variables: @@ -40,6 +40,7 @@ is to use the output of the --env option with export: The following groups are generated from --list: - ID (droplet ID) - NAME (droplet NAME) + - digital_ocean - image_ID - image_NAME - distro_NAME (distribution NAME from image) @@ -73,14 +74,12 @@ For each host, the following variables are registered: ----- ``` -usage: digital_ocean.py [-h] [--list] [--host HOST] [--all] - [--droplets] [--regions] [--images] [--sizes] - [--ssh-keys] [--domains] [--pretty] - [--cache-path CACHE_PATH] - [--cache-max_age CACHE_MAX_AGE] - [--force-cache] - [--refresh-cache] - [--api-token API_TOKEN] +usage: digital_ocean.py [-h] [--list] [--host HOST] [--all] [--droplets] + [--regions] [--images] [--sizes] [--ssh-keys] + [--domains] [--tags] [--pretty] + [--cache-path CACHE_PATH] + [--cache-max_age CACHE_MAX_AGE] [--force-cache] + [--refresh-cache] [--env] [--api-token API_TOKEN] Produce an Ansible Inventory file based on DigitalOcean credentials @@ -91,65 +90,129 @@ optional arguments: --host HOST Get all Ansible inventory variables about a specific Droplet --all List all DigitalOcean information as JSON - --droplets List Droplets as JSON + --droplets, -d List Droplets as JSON --regions List Regions as JSON --images List Images as JSON --sizes List Sizes as JSON --ssh-keys List SSH keys as JSON --domains List Domains as JSON + --tags List Tags as JSON --pretty, -p Pretty-print results --cache-path CACHE_PATH Path to the cache files (default: .) 
--cache-max_age CACHE_MAX_AGE Maximum age of the cached items (default: 0) --force-cache Only use data from the cache - --refresh-cache Force refresh of cache by making API requests to + --refresh-cache, -r Force refresh of cache by making API requests to DigitalOcean (default: False - use cache files) + --env, -e Display DO_API_TOKEN --api-token API_TOKEN, -a API_TOKEN DigitalOcean API Token ``` -''' +""" # (c) 2013, Evan Wies +# (c) 2017, Ansible Project +# (c) 2017, Abhijeet Kasurde # # Inspired by the EC2 inventory plugin: # https://github.com/ansible/ansible/blob/devel/contrib/inventory/ec2.py # -# This file is part of Ansible, -# -# Ansible is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Ansible is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Ansible. If not, see . +# GNU General Public License v3.0+ (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt) + +from __future__ import (absolute_import, division, print_function) +__metaclass__ = type ###################################################################### -import os -import sys -import re import argparse -from time import time -import ConfigParser import ast +import os +import re +import requests +import sys +from time import time try: - import json + import ConfigParser except ImportError: - import simplejson as json + import configparser as ConfigParser -try: - from dopy.manager import DoManager -except ImportError as e: - sys.exit("failed=True msg='`dopy` library required for this script'") +import json + + +class DoManager: + def __init__(self, api_token): + self.api_token = api_token + self.api_endpoint = 'https://api.digitalocean.com/v2' + self.headers = {'Authorization': 'Bearer {0}'.format(self.api_token), + 'Content-type': 'application/json'} + self.timeout = 60 + + def _url_builder(self, path): + if path[0] == '/': + path = path[1:] + return '%s/%s' % (self.api_endpoint, path) + + def send(self, url, method='GET', data=None): + url = self._url_builder(url) + data = json.dumps(data) + try: + if method == 'GET': + resp_data = {} + incomplete = True + while incomplete: + resp = requests.get(url, data=data, headers=self.headers, timeout=self.timeout) + json_resp = resp.json() + + for key, value in json_resp.items(): + if isinstance(value, list) and key in resp_data: + resp_data[key] += value + else: + resp_data[key] = value + + try: + url = json_resp['links']['pages']['next'] + except KeyError: + incomplete = False + + except ValueError as e: + sys.exit("Unable to parse result from %s: %s" % (url, e)) + return resp_data + + def all_active_droplets(self): + resp = self.send('droplets/') + return resp['droplets'] + + def all_regions(self): + resp = self.send('regions/') + return resp['regions'] + + def all_images(self, filter_name='global'): + params = {'filter': filter_name} + resp = self.send('images/', data=params) + return resp['images'] + + def sizes(self): + resp = self.send('sizes/') + return resp['sizes'] + + def all_ssh_keys(self): + resp = self.send('account/keys') + return resp['ssh_keys'] + + def all_domains(self): + resp = self.send('domains/') + return 
resp['domains'] + + def show_droplet(self, droplet_id): + resp = self.send('droplets/%s' % droplet_id) + return resp['droplet'] + + def all_tags(self): + resp = self.send('tags') + return resp['tags'] class DigitalOceanInventory(object): @@ -159,7 +222,7 @@ class DigitalOceanInventory(object): ########################################################################### def __init__(self): - ''' Main execution path ''' + """Main execution path """ # DigitalOceanInventory data self.data = {} # All DigitalOcean data @@ -178,9 +241,9 @@ class DigitalOceanInventory(object): # Verify credentials were set if not hasattr(self, 'api_token'): - sys.stderr.write('''Could not find values for DigitalOcean api_token. -They must be specified via either ini file, command line argument (--api-token), -or environment variables (DO_API_TOKEN)\n''') + msg = 'Could not find values for DigitalOcean api_token. They must be specified via either ini file, ' \ + 'command line argument (--api-token), or environment variables (DO_API_TOKEN)\n' + sys.stderr.write(msg) sys.exit(-1) # env command, show DigitalOcean credentials @@ -196,10 +259,10 @@ or environment variables (DO_API_TOKEN)\n''') self.load_from_cache() if len(self.data) == 0: if self.args.force_cache: - sys.stderr.write('''Cache is empty and --force-cache was specified\n''') + sys.stderr.write('Cache is empty and --force-cache was specified\n') sys.exit(-1) - self.manager = DoManager(None, self.api_token, api_version=2) + self.manager = DoManager(self.api_token) # Pick the json_data to print based on the CLI command if self.args.droplets: @@ -220,6 +283,9 @@ or environment variables (DO_API_TOKEN)\n''') elif self.args.domains: self.load_from_digital_ocean('domains') json_data = {'domains': self.data['domains']} + elif self.args.tags: + self.load_from_digital_ocean('tags') + json_data = {'tags': self.data['tags']} elif self.args.all: self.load_from_digital_ocean() json_data = self.data @@ -234,19 +300,19 @@ or environment variables (DO_API_TOKEN)\n''') self.write_to_cache() if self.args.pretty: - print(json.dumps(json_data, sort_keys=True, indent=2)) + print(json.dumps(json_data, indent=2)) else: print(json.dumps(json_data)) - # That's all she wrote... 
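The GET loop in `DoManager.send` above pages through the DigitalOcean v2 API by following `links.pages.next` until the API stops returning one, merging the list-valued fields of each page. The same pattern, as a hedged sketch in Go (endpoint per the v2 docs; error handling and rate limits elided):

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

// page mirrors the slice of the v2 response we care about: a droplet list
// plus the pagination cursor.
type page struct {
	Droplets []json.RawMessage `json:"droplets"`
	Links    struct {
		Pages struct {
			Next string `json:"next"`
		} `json:"pages"`
	} `json:"links"`
}

// fetchDroplets follows links.pages.next until exhausted, the same loop
// DoManager.send implements in the Python script above.
func fetchDroplets(token string) ([]json.RawMessage, error) {
	var all []json.RawMessage
	url := "https://api.digitalocean.com/v2/droplets/"
	for url != "" {
		req, err := http.NewRequest(http.MethodGet, url, nil)
		if err != nil {
			return nil, err
		}
		req.Header.Set("Authorization", "Bearer "+token)
		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			return nil, err
		}
		var p page
		err = json.NewDecoder(resp.Body).Decode(&p)
		resp.Body.Close()
		if err != nil {
			return nil, err
		}
		all = append(all, p.Droplets...)
		url = p.Links.Pages.Next // empty string when this was the last page
	}
	return all, nil
}

func main() {
	droplets, err := fetchDroplets("<DO_API_TOKEN>")
	fmt.Println(len(droplets), err)
}
```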
########################################################################### # Script configuration ########################################################################### def read_settings(self): - ''' Reads the settings from the digital_ocean.ini file ''' - config = ConfigParser.SafeConfigParser() - config.read(os.path.dirname(os.path.realpath(__file__)) + '/digital_ocean.ini') + """ Reads the settings from the digital_ocean.ini file """ + config = ConfigParser.ConfigParser() + config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'digital_ocean.ini') + config.read(config_path) # Credentials if config.has_option('digital_ocean', 'api_token'): @@ -267,7 +333,7 @@ or environment variables (DO_API_TOKEN)\n''') self.group_variables = ast.literal_eval(config.get('digital_ocean', 'group_variables')) def read_environment(self): - ''' Reads the settings from environment variables ''' + """ Reads the settings from environment variables """ # Setup credentials if os.getenv("DO_API_TOKEN"): self.api_token = os.getenv("DO_API_TOKEN") @@ -275,7 +341,7 @@ or environment variables (DO_API_TOKEN)\n''') self.api_token = os.getenv("DO_API_KEY") def read_cli_args(self): - ''' Command line argument processing ''' + """ Command line argument processing """ parser = argparse.ArgumentParser(description='Produce an Ansible Inventory file based on DigitalOcean credentials') parser.add_argument('--list', action='store_true', help='List all active Droplets as Ansible inventory (default: True)') @@ -288,6 +354,7 @@ or environment variables (DO_API_TOKEN)\n''') parser.add_argument('--sizes', action='store_true', help='List Sizes as JSON') parser.add_argument('--ssh-keys', action='store_true', help='List SSH keys as JSON') parser.add_argument('--domains', action='store_true', help='List Domains as JSON') + parser.add_argument('--tags', action='store_true', help='List Tags as JSON') parser.add_argument('--pretty', '-p', action='store_true', help='Pretty-print results') @@ -309,6 +376,7 @@ or environment variables (DO_API_TOKEN)\n''') if (not self.args.droplets and not self.args.regions and not self.args.images and not self.args.sizes and not self.args.ssh_keys and not self.args.domains and + not self.args.tags and not self.args.all and not self.args.host): self.args.list = True @@ -317,7 +385,7 @@ or environment variables (DO_API_TOKEN)\n''') ########################################################################### def load_from_digital_ocean(self, resource=None): - '''Get JSON from DigitalOcean API''' + """Get JSON from DigitalOcean API """ if self.args.force_cache and os.path.isfile(self.cache_filename): return # We always get fresh droplets @@ -333,7 +401,7 @@ or environment variables (DO_API_TOKEN)\n''') self.data['regions'] = self.manager.all_regions() self.cache_refreshed = True if resource == 'images' or resource is None: - self.data['images'] = self.manager.all_images(filter=None) + self.data['images'] = self.manager.all_images() self.cache_refreshed = True if resource == 'sizes' or resource is None: self.data['sizes'] = self.manager.sizes() @@ -344,9 +412,27 @@ or environment variables (DO_API_TOKEN)\n''') if resource == 'domains' or resource is None: self.data['domains'] = self.manager.all_domains() self.cache_refreshed = True + if resource == 'tags' or resource is None: + self.data['tags'] = self.manager.all_tags() + self.cache_refreshed = True + + def add_inventory_group(self, key): + """ Method to create group dict """ + host_dict = {'hosts': [], 'vars': {}} + self.inventory[key] = 
host_dict + return + + def add_host(self, group, host): + """ Helper method to reduce host duplication """ + if group not in self.inventory: + self.add_inventory_group(group) + + if host not in self.inventory[group]['hosts']: + self.inventory[group]['hosts'].append(host) + return def build_inventory(self): - '''Build Ansible inventory of droplets''' + """ Build Ansible inventory of droplets """ self.inventory = { 'all': { 'hosts': [], @@ -357,52 +443,44 @@ or environment variables (DO_API_TOKEN)\n''') # add all droplets by id and name for droplet in self.data['droplets']: - # when using private_networking, the API reports the private one in "ip_address". - if 'private_networking' in droplet['features'] and not self.use_private_network: - for net in droplet['networks']['v4']: - if net['type'] == 'public': - dest = net['ip_address'] - else: - continue - else: - dest = droplet['ip_address'] + for net in droplet['networks']['v4']: + if net['type'] == 'public': + dest = net['ip_address'] + else: + continue self.inventory['all']['hosts'].append(dest) - self.inventory[droplet['id']] = [dest] - self.inventory[droplet['name']] = [dest] + self.add_host(droplet['id'], dest) + + self.add_host(droplet['name'], dest) # groups that are always present - for group in ('region_' + droplet['region']['slug'], + for group in ('digital_ocean', + 'region_' + droplet['region']['slug'], 'image_' + str(droplet['image']['id']), 'size_' + droplet['size']['slug'], - 'distro_' + self.to_safe(droplet['image']['distribution']), + 'distro_' + DigitalOceanInventory.to_safe(droplet['image']['distribution']), 'status_' + droplet['status']): - if group not in self.inventory: - self.inventory[group] = {'hosts': [], 'vars': {}} - self.inventory[group]['hosts'].append(dest) + self.add_host(group, dest) # groups that are not always present for group in (droplet['image']['slug'], droplet['image']['name']): if group: - image = 'image_' + self.to_safe(group) - if image not in self.inventory: - self.inventory[image] = {'hosts': [], 'vars': {}} - self.inventory[image]['hosts'].append(dest) + image = 'image_' + DigitalOceanInventory.to_safe(group) + self.add_host(image, dest) if droplet['tags']: for tag in droplet['tags']: - if tag not in self.inventory: - self.inventory[tag] = {'hosts': [], 'vars': {}} - self.inventory[tag]['hosts'].append(dest) + self.add_host(tag, dest) # hostvars info = self.do_namespace(droplet) self.inventory['_meta']['hostvars'][dest] = info def load_droplet_variables_for_host(self): - '''Generate a JSON response to a --host call''' + """ Generate a JSON response to a --host call """ host = int(self.args.host) droplet = self.manager.show_droplet(host) info = self.do_namespace(droplet) @@ -413,7 +491,7 @@ or environment variables (DO_API_TOKEN)\n''') ########################################################################### def is_cache_valid(self): - ''' Determines if the cache files have expired, or if it is still valid ''' + """ Determines if the cache files have expired, or if it is still valid """ if os.path.isfile(self.cache_filename): mod_time = os.path.getmtime(self.cache_filename) current_time = time() @@ -422,11 +500,10 @@ or environment variables (DO_API_TOKEN)\n''') return False def load_from_cache(self): - ''' Reads the data from the cache file and assigns it to member variables as Python Objects''' + """ Reads the data from the cache file and assigns it to member variables as Python Objects """ try: - cache = open(self.cache_filename, 'r') - json_data = cache.read() - cache.close() + with 
open(self.cache_filename, 'r') as cache: + json_data = cache.read() data = json.loads(json_data) except IOError: data = {'data': {}, 'inventory': {}} @@ -435,31 +512,24 @@ or environment variables (DO_API_TOKEN)\n''') self.inventory = data['inventory'] def write_to_cache(self): - ''' Writes data in JSON format to a file ''' + """ Writes data in JSON format to a file """ data = {'data': self.data, 'inventory': self.inventory} - json_data = json.dumps(data, sort_keys=True, indent=2) + json_data = json.dumps(data, indent=2) - cache = open(self.cache_filename, 'w') - cache.write(json_data) - cache.close() + with open(self.cache_filename, 'w') as cache: + cache.write(json_data) ########################################################################### # Utilities ########################################################################### + @staticmethod + def to_safe(word): + """ Converts 'bad' characters in a string to underscores so they can be used as Ansible groups """ + return re.sub(r"[^A-Za-z0-9\-.]", "_", word) - def push(self, my_dict, key, element): - ''' Pushed an element onto an array that may not have been defined in the dict ''' - if key in my_dict: - my_dict[key].append(element) - else: - my_dict[key] = [element] - - def to_safe(self, word): - ''' Converts 'bad' characters in a string to underscores so they can be used as Ansible groups ''' - return re.sub("[^A-Za-z0-9\-\.]", "_", word) - - def do_namespace(self, data): - ''' Returns a copy of the dictionary with all the keys put in a 'do_' namespace ''' + @staticmethod + def do_namespace(data): + """ Returns a copy of the dictionary with all the keys put in a 'do_' namespace """ info = {} for k, v in data.items(): info['do_' + k] = v diff --git a/networks/remote/terraform/cluster/main.tf b/networks/remote/terraform/cluster/main.tf index 98ab37cee..15a913b30 100644 --- a/networks/remote/terraform/cluster/main.tf +++ b/networks/remote/terraform/cluster/main.tf @@ -1,3 +1,12 @@ +terraform { + required_providers { + digitalocean = { + source = "digitalocean/digitalocean" + version = "~> 2.0" + } + } +} + resource "digitalocean_tag" "cluster" { name = "${var.name}" } diff --git a/networks/remote/terraform/cluster/variables.tf b/networks/remote/terraform/cluster/variables.tf index 1b6a70072..0dc66fafe 100644 --- a/networks/remote/terraform/cluster/variables.tf +++ b/networks/remote/terraform/cluster/variables.tf @@ -4,13 +4,13 @@ variable "name" { variable "regions" { description = "Regions to launch in" - type = "list" + type = list default = ["AMS3", "FRA1", "LON1", "NYC3", "SFO2", "SGP1", "TOR1"] } variable "ssh_key" { description = "SSH key filename to copy to the nodes" - type = "string" + type = string } variable "instance_size" { diff --git a/networks/remote/terraform/main.tf b/networks/remote/terraform/main.tf index a768ee13a..470734694 100644 --- a/networks/remote/terraform/main.tf +++ b/networks/remote/terraform/main.tf @@ -1,5 +1,14 @@ #Terraform Configuration +terraform { + required_providers { + digitalocean = { + source = "digitalocean/digitalocean" + version = "~> 2.0" + } + } +} + variable "DO_API_TOKEN" { description = "DigitalOcean Access Token" } @@ -11,7 +20,7 @@ variable "TESTNET_NAME" { variable "SSH_KEY_FILE" { description = "SSH public key file to be used on the nodes" - type = "string" + type = string } variable "SERVERS" { diff --git a/node/node.go b/node/node.go index 0dedd2861..89b6b057e 100644 --- a/node/node.go +++ b/node/node.go @@ -28,7 +28,6 @@ import ( "github.com/tendermint/tendermint/libs/service" 
"github.com/tendermint/tendermint/libs/strings" tmtime "github.com/tendermint/tendermint/libs/time" - "github.com/tendermint/tendermint/light" "github.com/tendermint/tendermint/privval" tmgrpc "github.com/tendermint/tendermint/privval/grpc" "github.com/tendermint/tendermint/proxy" @@ -221,7 +220,7 @@ func makeNode(config *cfg.Config, // Determine whether we should do block sync. This must happen after the handshake, since the // app may modify the validator set, specifying ourself as the only validator. - blockSync := config.FastSyncMode && !onlyValidatorIsUs(state, pubKey) + blockSync := config.BlockSync.Enable && !onlyValidatorIsUs(state, pubKey) logNodeStartupInfo(state, pubKey, logger, consensusLogger, config.Mode) @@ -328,6 +327,8 @@ func makeNode(config *cfg.Config, } stateSyncReactor = statesync.NewReactor( + genDoc.ChainID, + genDoc.InitialHeight, *config.StateSync, stateSyncReactorShim.Logger, proxyApp.Snapshot(), @@ -335,6 +336,7 @@ func makeNode(config *cfg.Config, channels[statesync.SnapshotChannel], channels[statesync.ChunkChannel], channels[statesync.LightBlockChannel], + channels[statesync.ParamsChannel], peerUpdates, stateStore, blockStore, @@ -671,6 +673,8 @@ func (n *nodeImpl) OnStart() error { } // Run state sync + // TODO: We shouldn't run state sync if we already have state that has a + // LastBlockHeight that is not InitialHeight if n.stateSync { bcR, ok := n.bcReactor.(cs.BlockSyncReactor) if !ok { @@ -683,17 +687,52 @@ func (n *nodeImpl) OnStart() error { return fmt.Errorf("unable to derive state: %w", err) } - ssc := n.config.StateSync - sp, err := constructStateProvider(ssc, state, n.Logger.With("module", "light")) - - if err != nil { - return fmt.Errorf("failed to set up light client state provider: %w", err) + // TODO: we may want to move these events within the respective + // reactors. + // At the beginning of the statesync start, we use the initialHeight as the event height + // because of the statesync doesn't have the concreate state height before fetched the snapshot. + d := types.EventDataStateSyncStatus{Complete: false, Height: state.InitialHeight} + if err := n.eventBus.PublishEventStateSyncStatus(d); err != nil { + n.eventBus.Logger.Error("failed to emit the statesync start event", "err", err) } - if err := startStateSync(n.stateSyncReactor, bcR, n.consensusReactor, sp, - ssc, n.config.FastSyncMode, state.InitialHeight, n.eventBus); err != nil { - return fmt.Errorf("failed to start state sync: %w", err) - } + // FIXME: We shouldn't allow state sync to silently error out without + // bubbling up the error and gracefully shutting down the rest of the node + go func() { + n.Logger.Info("starting state sync") + state, err := n.stateSyncReactor.Sync(context.TODO()) + if err != nil { + n.Logger.Error("state sync failed", "err", err) + return + } + + n.consensusReactor.SetStateSyncingMetrics(0) + + d := types.EventDataStateSyncStatus{Complete: true, Height: state.LastBlockHeight} + if err := n.eventBus.PublishEventStateSyncStatus(d); err != nil { + n.eventBus.Logger.Error("failed to emit the statesync start event", "err", err) + } + + // TODO: Some form of orchestrator is needed here between the state + // advancing reactors to be able to control which one of the three + // is running + if n.config.BlockSync.Enable { + // FIXME Very ugly to have these metrics bleed through here. 
+ n.consensusReactor.SetBlockSyncingMetrics(1) + if err := bcR.SwitchToBlockSync(state); err != nil { + n.Logger.Error("failed to switch to block sync", "err", err) + return + } + + d := types.EventDataBlockSyncStatus{Complete: false, Height: state.LastBlockHeight} + if err := n.eventBus.PublishEventBlockSyncStatus(d); err != nil { + n.eventBus.Logger.Error("failed to emit the block sync starting event", "err", err) + } + + } else { + n.consensusReactor.SwitchToConsensus(state, true) + } + }() } return nil @@ -978,67 +1017,6 @@ func (n *nodeImpl) NodeInfo() types.NodeInfo { return n.nodeInfo } -// startStateSync starts an asynchronous state sync process, then switches to block sync mode. -func startStateSync( - ssR statesync.SyncReactor, - bcR cs.BlockSyncReactor, - conR cs.ConsSyncReactor, - sp statesync.StateProvider, - config *cfg.StateSyncConfig, - blockSync bool, - stateInitHeight int64, - eb *types.EventBus, -) error { - stateSyncLogger := eb.Logger.With("module", "statesync") - - stateSyncLogger.Info("starting state sync...") - - // at the beginning of the statesync start, we use the initialHeight as the event height - // because of the statesync doesn't have the concreate state height before fetched the snapshot. - d := types.EventDataStateSyncStatus{Complete: false, Height: stateInitHeight} - if err := eb.PublishEventStateSyncStatus(d); err != nil { - stateSyncLogger.Error("failed to emit the statesync start event", "err", err) - } - - go func() { - state, err := ssR.Sync(context.TODO(), sp, config.DiscoveryTime) - if err != nil { - stateSyncLogger.Error("state sync failed", "err", err) - return - } - - if err := ssR.Backfill(state); err != nil { - stateSyncLogger.Error("backfill failed; node has insufficient history to verify all evidence;"+ - " proceeding optimistically...", "err", err) - } - - conR.SetStateSyncingMetrics(0) - - d := types.EventDataStateSyncStatus{Complete: true, Height: state.LastBlockHeight} - if err := eb.PublishEventStateSyncStatus(d); err != nil { - stateSyncLogger.Error("failed to emit the statesync start event", "err", err) - } - - if blockSync { - // FIXME Very ugly to have these metrics bleed through here. - conR.SetBlockSyncingMetrics(1) - if err := bcR.SwitchToBlockSync(state); err != nil { - stateSyncLogger.Error("failed to switch to block sync", "err", err) - return - } - - d := types.EventDataBlockSyncStatus{Complete: false, Height: state.LastBlockHeight} - if err := eb.PublishEventBlockSyncStatus(d); err != nil { - stateSyncLogger.Error("failed to emit the block sync starting event", "err", err) - } - - } else { - conR.SwitchToConsensus(state, true) - } - }() - return nil -} - // genesisDocProvider returns a GenesisDoc. // It allows the GenesisDoc to be pulled from sources other than the // filesystem, for instance from a distributed key-value store cluster. 
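Consumers can observe the status events published in `OnStart` above through the event bus, the same way the (now removed) node test did. A hedged sketch of such a subscriber; `EventQueryStateSyncStatus` and `EventDataStateSyncStatus` are existing types, while the function, subscriber name, and capacity are illustrative:

```go
// waitForStateSync blocks until a state sync completion event is observed.
func waitForStateSync(ctx context.Context, eventBus *types.EventBus, logger log.Logger) error {
	sub, err := eventBus.Subscribe(ctx, "status-watcher", types.EventQueryStateSyncStatus, 10)
	if err != nil {
		return err
	}
	for {
		select {
		case msg := <-sub.Out():
			if status, ok := msg.Data().(types.EventDataStateSyncStatus); ok && status.Complete {
				logger.Info("state sync complete", "height", status.Height)
				return nil
			}
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
```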
@@ -1221,24 +1199,3 @@ func getChannelsFromShim(reactorShim *p2p.ReactorShim) map[p2p.ChannelID]*p2p.Ch return channels } - -func constructStateProvider( - ssc *cfg.StateSyncConfig, - state sm.State, - logger log.Logger, -) (statesync.StateProvider, error) { - ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second) - defer cancel() - - to := light.TrustOptions{ - Period: ssc.TrustPeriod, - Height: ssc.TrustHeight, - Hash: ssc.TrustHashBytes(), - } - - return statesync.NewLightClientStateProvider( - ctx, - state.ChainID, state.Version, state.InitialHeight, - ssc.RPCServers, to, logger, - ) -} diff --git a/node/node_test.go b/node/node_test.go index 885bddcfc..6925008a6 100644 --- a/node/node_test.go +++ b/node/node_test.go @@ -21,16 +21,12 @@ import ( "github.com/tendermint/tendermint/crypto" "github.com/tendermint/tendermint/crypto/ed25519" "github.com/tendermint/tendermint/crypto/tmhash" - consmocks "github.com/tendermint/tendermint/internal/consensus/mocks" - ssmocks "github.com/tendermint/tendermint/internal/statesync/mocks" "github.com/tendermint/tendermint/internal/evidence" "github.com/tendermint/tendermint/internal/mempool" mempoolv0 "github.com/tendermint/tendermint/internal/mempool/v0" - statesync "github.com/tendermint/tendermint/internal/statesync" "github.com/tendermint/tendermint/internal/test/factory" "github.com/tendermint/tendermint/libs/log" - tmpubsub "github.com/tendermint/tendermint/libs/pubsub" tmrand "github.com/tendermint/tendermint/libs/rand" tmtime "github.com/tendermint/tendermint/libs/time" "github.com/tendermint/tendermint/privval" @@ -669,65 +665,3 @@ func loadStatefromGenesis(t *testing.T) sm.State { return state } - -func TestNodeStartStateSync(t *testing.T) { - mockSSR := &statesync.MockSyncReactor{} - mockFSR := &consmocks.BlockSyncReactor{} - mockCSR := &consmocks.ConsSyncReactor{} - mockSP := &ssmocks.StateProvider{} - state := loadStatefromGenesis(t) - config := cfg.ResetTestRoot("load_state_from_genesis") - - eventBus, err := createAndStartEventBus(log.TestingLogger()) - defer func() { - err := eventBus.Stop() - require.NoError(t, err) - }() - - require.NoError(t, err) - require.NotNil(t, eventBus) - - sub, err := eventBus.Subscribe(context.Background(), "test-client", types.EventQueryStateSyncStatus, 10) - require.NoError(t, err) - require.NotNil(t, sub) - - cfgSS := config.StateSync - - mockSSR.On("Sync", context.TODO(), mockSP, cfgSS.DiscoveryTime).Return(state, nil). - On("Backfill", state).Return(nil) - mockCSR.On("SetStateSyncingMetrics", float64(0)).Return(). 
- On("SwitchToConsensus", state, true).Return() - - require.NoError(t, - startStateSync(mockSSR, mockFSR, mockCSR, mockSP, config.StateSync, false, state.InitialHeight, eventBus)) - - for cnt := 0; cnt < 2; { - select { - case <-time.After(3 * time.Second): - t.Errorf("StateSyncStatus timeout") - case msg := <-sub.Out(): - if cnt == 0 { - ensureStateSyncStatus(t, msg, false, state.InitialHeight) - cnt++ - } else { - // the state height = 0 because we are not actually update the state in this test - ensureStateSyncStatus(t, msg, true, 0) - cnt++ - } - } - } - - mockSSR.AssertNumberOfCalls(t, "Sync", 1) - mockSSR.AssertNumberOfCalls(t, "Backfill", 1) - mockCSR.AssertNumberOfCalls(t, "SetStateSyncingMetrics", 1) - mockCSR.AssertNumberOfCalls(t, "SwitchToConsensus", 1) -} - -func ensureStateSyncStatus(t *testing.T, msg tmpubsub.Message, complete bool, height int64) { - t.Helper() - status, ok := msg.Data().(types.EventDataStateSyncStatus) - - require.True(t, ok) - require.Equal(t, complete, status.Complete) - require.Equal(t, height, status.Height) -} diff --git a/node/setup.go b/node/setup.go index 00f8051f0..1a7c1b3b2 100644 --- a/node/setup.go +++ b/node/setup.go @@ -706,6 +706,7 @@ func makeNodeInfo( byte(statesync.SnapshotChannel), byte(statesync.ChunkChannel), byte(statesync.LightBlockChannel), + byte(statesync.ParamsChannel), }, Moniker: config.Moniker, Other: types.NodeInfoOther{ diff --git a/proto/tendermint/statesync/message.go b/proto/tendermint/statesync/message.go index 6f9b6ad59..992cd7525 100644 --- a/proto/tendermint/statesync/message.go +++ b/proto/tendermint/statesync/message.go @@ -28,6 +28,12 @@ func (m *Message) Wrap(pb proto.Message) error { case *LightBlockResponse: m.Sum = &Message_LightBlockResponse{LightBlockResponse: msg} + case *ParamsRequest: + m.Sum = &Message_ParamsRequest{ParamsRequest: msg} + + case *ParamsResponse: + m.Sum = &Message_ParamsResponse{ParamsResponse: msg} + default: return fmt.Errorf("unknown message: %T", msg) } @@ -57,6 +63,12 @@ func (m *Message) Unwrap() (proto.Message, error) { case *Message_LightBlockResponse: return m.GetLightBlockResponse(), nil + case *Message_ParamsRequest: + return m.GetParamsRequest(), nil + + case *Message_ParamsResponse: + return m.GetParamsResponse(), nil + default: return nil, fmt.Errorf("unknown message: %T", msg) } @@ -106,6 +118,17 @@ func (m *Message) Validate() error { // light block validation handled by the backfill process case *Message_LightBlockResponse: + case *Message_ParamsRequest: + if m.GetParamsRequest().Height == 0 { + return errors.New("height cannot be 0") + } + + case *Message_ParamsResponse: + resp := m.GetParamsResponse() + if resp.Height == 0 { + return errors.New("height cannot be 0") + } + default: return fmt.Errorf("unknown message type: %T", msg) } diff --git a/proto/tendermint/statesync/message_test.go b/proto/tendermint/statesync/message_test.go index dcf089130..40428ec07 100644 --- a/proto/tendermint/statesync/message_test.go +++ b/proto/tendermint/statesync/message_test.go @@ -9,6 +9,7 @@ import ( ssproto "github.com/tendermint/tendermint/proto/tendermint/statesync" tmproto "github.com/tendermint/tendermint/proto/tendermint/types" + "github.com/tendermint/tendermint/types" ) func TestValidateMsg(t *testing.T) { @@ -161,6 +162,35 @@ func TestStateSyncVectors(t *testing.T) { }, "2214080110021803220c697427732061206368756e6b", }, + { + "LightBlockRequest", + &ssproto.LightBlockRequest{ + Height: 100, + }, + "2a020864", + }, + { + "LightBlockResponse", + &ssproto.LightBlockResponse{ + 
LightBlock: nil, + }, + "3200", + }, + { + "ParamsRequest", + &ssproto.ParamsRequest{ + Height: 9001, + }, + "3a0308a946", + }, + { + "ParamsResponse", + &ssproto.ParamsResponse{ + Height: 9001, + ConsensusParams: types.DefaultConsensusParams().ToProto(), + }, + "423408a946122f0a10088080c00a10ffffffffffffffffff01120e08a08d0612040880c60a188080401a090a07656432353531392200", + }, } for _, tc := range testCases { diff --git a/proto/tendermint/statesync/types.pb.go b/proto/tendermint/statesync/types.pb.go index f5eab7a33..5541c2803 100644 --- a/proto/tendermint/statesync/types.pb.go +++ b/proto/tendermint/statesync/types.pb.go @@ -5,6 +5,7 @@ package statesync import ( fmt "fmt" + _ "github.com/gogo/protobuf/gogoproto" proto "github.com/gogo/protobuf/proto" types "github.com/tendermint/tendermint/proto/tendermint/types" io "io" @@ -31,6 +32,8 @@ type Message struct { // *Message_ChunkResponse // *Message_LightBlockRequest // *Message_LightBlockResponse + // *Message_ParamsRequest + // *Message_ParamsResponse Sum isMessage_Sum `protobuf_oneof:"sum"` } @@ -91,6 +94,12 @@ type Message_LightBlockRequest struct { type Message_LightBlockResponse struct { LightBlockResponse *LightBlockResponse `protobuf:"bytes,6,opt,name=light_block_response,json=lightBlockResponse,proto3,oneof" json:"light_block_response,omitempty"` } +type Message_ParamsRequest struct { + ParamsRequest *ParamsRequest `protobuf:"bytes,7,opt,name=params_request,json=paramsRequest,proto3,oneof" json:"params_request,omitempty"` +} +type Message_ParamsResponse struct { + ParamsResponse *ParamsResponse `protobuf:"bytes,8,opt,name=params_response,json=paramsResponse,proto3,oneof" json:"params_response,omitempty"` +} func (*Message_SnapshotsRequest) isMessage_Sum() {} func (*Message_SnapshotsResponse) isMessage_Sum() {} @@ -98,6 +107,8 @@ func (*Message_ChunkRequest) isMessage_Sum() {} func (*Message_ChunkResponse) isMessage_Sum() {} func (*Message_LightBlockRequest) isMessage_Sum() {} func (*Message_LightBlockResponse) isMessage_Sum() {} +func (*Message_ParamsRequest) isMessage_Sum() {} +func (*Message_ParamsResponse) isMessage_Sum() {} func (m *Message) GetSum() isMessage_Sum { if m != nil { @@ -148,6 +159,20 @@ func (m *Message) GetLightBlockResponse() *LightBlockResponse { return nil } +func (m *Message) GetParamsRequest() *ParamsRequest { + if x, ok := m.GetSum().(*Message_ParamsRequest); ok { + return x.ParamsRequest + } + return nil +} + +func (m *Message) GetParamsResponse() *ParamsResponse { + if x, ok := m.GetSum().(*Message_ParamsResponse); ok { + return x.ParamsResponse + } + return nil +} + // XXX_OneofWrappers is for the internal use of the proto package. 
func (*Message) XXX_OneofWrappers() []interface{} { return []interface{}{ @@ -157,6 +182,8 @@ func (*Message) XXX_OneofWrappers() []interface{} { (*Message_ChunkResponse)(nil), (*Message_LightBlockRequest)(nil), (*Message_LightBlockResponse)(nil), + (*Message_ParamsRequest)(nil), + (*Message_ParamsResponse)(nil), } } @@ -496,6 +523,102 @@ func (m *LightBlockResponse) GetLightBlock() *types.LightBlock { return nil } +type ParamsRequest struct { + Height uint64 `protobuf:"varint,1,opt,name=height,proto3" json:"height,omitempty"` +} + +func (m *ParamsRequest) Reset() { *m = ParamsRequest{} } +func (m *ParamsRequest) String() string { return proto.CompactTextString(m) } +func (*ParamsRequest) ProtoMessage() {} +func (*ParamsRequest) Descriptor() ([]byte, []int) { + return fileDescriptor_a1c2869546ca7914, []int{7} +} +func (m *ParamsRequest) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *ParamsRequest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_ParamsRequest.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *ParamsRequest) XXX_Merge(src proto.Message) { + xxx_messageInfo_ParamsRequest.Merge(m, src) +} +func (m *ParamsRequest) XXX_Size() int { + return m.Size() +} +func (m *ParamsRequest) XXX_DiscardUnknown() { + xxx_messageInfo_ParamsRequest.DiscardUnknown(m) +} + +var xxx_messageInfo_ParamsRequest proto.InternalMessageInfo + +func (m *ParamsRequest) GetHeight() uint64 { + if m != nil { + return m.Height + } + return 0 +} + +type ParamsResponse struct { + Height uint64 `protobuf:"varint,1,opt,name=height,proto3" json:"height,omitempty"` + ConsensusParams types.ConsensusParams `protobuf:"bytes,2,opt,name=consensus_params,json=consensusParams,proto3" json:"consensus_params"` +} + +func (m *ParamsResponse) Reset() { *m = ParamsResponse{} } +func (m *ParamsResponse) String() string { return proto.CompactTextString(m) } +func (*ParamsResponse) ProtoMessage() {} +func (*ParamsResponse) Descriptor() ([]byte, []int) { + return fileDescriptor_a1c2869546ca7914, []int{8} +} +func (m *ParamsResponse) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *ParamsResponse) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_ParamsResponse.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalToSizedBuffer(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *ParamsResponse) XXX_Merge(src proto.Message) { + xxx_messageInfo_ParamsResponse.Merge(m, src) +} +func (m *ParamsResponse) XXX_Size() int { + return m.Size() +} +func (m *ParamsResponse) XXX_DiscardUnknown() { + xxx_messageInfo_ParamsResponse.DiscardUnknown(m) +} + +var xxx_messageInfo_ParamsResponse proto.InternalMessageInfo + +func (m *ParamsResponse) GetHeight() uint64 { + if m != nil { + return m.Height + } + return 0 +} + +func (m *ParamsResponse) GetConsensusParams() types.ConsensusParams { + if m != nil { + return m.ConsensusParams + } + return types.ConsensusParams{} +} + func init() { proto.RegisterType((*Message)(nil), "tendermint.statesync.Message") proto.RegisterType((*SnapshotsRequest)(nil), "tendermint.statesync.SnapshotsRequest") @@ -504,43 +627,51 @@ func init() { proto.RegisterType((*ChunkResponse)(nil), "tendermint.statesync.ChunkResponse") proto.RegisterType((*LightBlockRequest)(nil), 
"tendermint.statesync.LightBlockRequest") proto.RegisterType((*LightBlockResponse)(nil), "tendermint.statesync.LightBlockResponse") + proto.RegisterType((*ParamsRequest)(nil), "tendermint.statesync.ParamsRequest") + proto.RegisterType((*ParamsResponse)(nil), "tendermint.statesync.ParamsResponse") } func init() { proto.RegisterFile("tendermint/statesync/types.proto", fileDescriptor_a1c2869546ca7914) } var fileDescriptor_a1c2869546ca7914 = []byte{ - // 485 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xac, 0x54, 0x51, 0x6b, 0xd3, 0x50, - 0x14, 0x4e, 0x5c, 0xdb, 0x8d, 0xb3, 0x46, 0x96, 0x63, 0x91, 0x32, 0x46, 0x18, 0x11, 0x74, 0x20, - 0xa4, 0xa0, 0x8f, 0xe2, 0x4b, 0x7d, 0x99, 0x30, 0x5f, 0xee, 0x1c, 0xa8, 0x08, 0x23, 0x4d, 0xaf, - 0x4d, 0xb0, 0x49, 0x6a, 0xcf, 0x2d, 0xb8, 0x1f, 0xe0, 0x93, 0x2f, 0x82, 0x7f, 0xca, 0xc7, 0x3d, - 0xfa, 0x28, 0xed, 0x1f, 0x91, 0x9c, 0xdc, 0x26, 0x77, 0x6d, 0x5d, 0x11, 0xf6, 0x96, 0xef, 0xeb, - 0x77, 0x3e, 0xbe, 0x73, 0xcf, 0xe9, 0x81, 0x63, 0x25, 0xb3, 0xa1, 0x9c, 0xa6, 0x49, 0xa6, 0x7a, - 0xa4, 0x42, 0x25, 0xe9, 0x2a, 0x8b, 0x7a, 0xea, 0x6a, 0x22, 0x29, 0x98, 0x4c, 0x73, 0x95, 0x63, - 0xa7, 0x56, 0x04, 0x95, 0xe2, 0xf0, 0xc8, 0xa8, 0x63, 0xb5, 0x59, 0xe3, 0xff, 0x6c, 0xc0, 0xee, - 0x1b, 0x49, 0x14, 0x8e, 0x24, 0x5e, 0x80, 0x4b, 0x59, 0x38, 0xa1, 0x38, 0x57, 0x74, 0x39, 0x95, - 0x5f, 0x66, 0x92, 0x54, 0xd7, 0x3e, 0xb6, 0x4f, 0xf6, 0x9f, 0x3d, 0x0e, 0x36, 0x79, 0x07, 0xe7, - 0x4b, 0xb9, 0x28, 0xd5, 0xa7, 0x96, 0x38, 0xa0, 0x15, 0x0e, 0xdf, 0x01, 0x9a, 0xb6, 0x34, 0xc9, - 0x33, 0x92, 0xdd, 0x7b, 0xec, 0xfb, 0x64, 0xab, 0x6f, 0x29, 0x3f, 0xb5, 0x84, 0x4b, 0xab, 0x24, - 0xbe, 0x06, 0x27, 0x8a, 0x67, 0xd9, 0xe7, 0x2a, 0xec, 0x0e, 0x9b, 0xfa, 0x9b, 0x4d, 0x5f, 0x15, - 0xd2, 0x3a, 0x68, 0x3b, 0x32, 0x30, 0x9e, 0xc1, 0xfd, 0xa5, 0x95, 0x0e, 0xd8, 0x60, 0xaf, 0x47, - 0xb7, 0x7a, 0x55, 0xe1, 0x9c, 0xc8, 0x24, 0xf0, 0x3d, 0x3c, 0x18, 0x27, 0xa3, 0x58, 0x5d, 0x0e, - 0xc6, 0x79, 0x54, 0xc7, 0x6b, 0xde, 0xd6, 0xf3, 0x59, 0x51, 0xd0, 0x2f, 0xf4, 0x75, 0x46, 0x77, - 0xbc, 0x4a, 0xe2, 0x47, 0xe8, 0xdc, 0xb4, 0xd6, 0x71, 0x5b, 0xec, 0x7d, 0xb2, 0xdd, 0xbb, 0xca, - 0x8c, 0xe3, 0x35, 0xb6, 0xdf, 0x84, 0x1d, 0x9a, 0xa5, 0x3e, 0xc2, 0xc1, 0xea, 0x68, 0xfd, 0xef, - 0x36, 0xb8, 0x6b, 0x73, 0xc1, 0x87, 0xd0, 0x8a, 0x65, 0xe1, 0xc3, 0x8b, 0xd2, 0x10, 0x1a, 0x15, - 0xfc, 0xa7, 0x7c, 0x9a, 0x86, 0x8a, 0x07, 0xed, 0x08, 0x8d, 0x0a, 0x9e, 0x9f, 0x8a, 0x78, 0x56, - 0x8e, 0xd0, 0x08, 0x11, 0x1a, 0x71, 0x48, 0x31, 0xbf, 0x7a, 0x5b, 0xf0, 0x37, 0x1e, 0xc2, 0x5e, - 0x2a, 0x55, 0x38, 0x0c, 0x55, 0xc8, 0x4f, 0xd7, 0x16, 0x15, 0xf6, 0xdf, 0x42, 0xdb, 0x9c, 0xe7, - 0x7f, 0xe7, 0xe8, 0x40, 0x33, 0xc9, 0x86, 0xf2, 0xab, 0x8e, 0x51, 0x02, 0xff, 0x9b, 0x0d, 0xce, - 0x8d, 0xd1, 0xde, 0x8d, 0x6f, 0xc1, 0x72, 0x9f, 0xba, 0xbd, 0x12, 0x60, 0x17, 0x76, 0xd3, 0x84, - 0x28, 0xc9, 0x46, 0xdc, 0xde, 0x9e, 0x58, 0x42, 0xff, 0x29, 0xb8, 0x6b, 0xeb, 0xf0, 0xaf, 0x28, - 0xfe, 0x39, 0xe0, 0xfa, 0x7c, 0xf1, 0x25, 0xec, 0x1b, 0x7b, 0xa2, 0xff, 0xc6, 0x47, 0xe6, 0x7a, - 0x94, 0x67, 0xc0, 0x28, 0x85, 0x7a, 0x21, 0xfa, 0x17, 0xbf, 0xe6, 0x9e, 0x7d, 0x3d, 0xf7, 0xec, - 0x3f, 0x73, 0xcf, 0xfe, 0xb1, 0xf0, 0xac, 0xeb, 0x85, 0x67, 0xfd, 0x5e, 0x78, 0xd6, 0x87, 0x17, - 0xa3, 0x44, 0xc5, 0xb3, 0x41, 0x10, 0xe5, 0x69, 0xcf, 0x3c, 0x2d, 0xf5, 0x27, 0x5f, 0x96, 0xde, - 0xa6, 0x73, 0x35, 0x68, 0xf1, 0x6f, 0xcf, 0xff, 0x06, 0x00, 0x00, 0xff, 0xff, 0xc1, 0x45, 0x35, - 0xee, 0xcd, 0x04, 0x00, 0x00, + // 589 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x02, 0xff, 0xac, 0x95, 0x4f, 0x8b, 0xd3, 0x40, + 0x18, 0xc6, 0x13, 0xb7, 0xdd, 0x96, 0x77, 0x9b, 0x6e, 0x3b, 0x16, 0x29, 0x65, 0x8d, 0x6b, 0x14, + 0x77, 0x41, 0x68, 0x41, 0x8f, 0xe2, 0xa5, 0x7b, 0x59, 0x61, 0x45, 0x99, 0x75, 0x41, 0x45, 0x28, + 0x69, 0x3a, 0x26, 0xc1, 0xe6, 0x8f, 0x7d, 0xa7, 0xe0, 0x82, 0x57, 0x4f, 0x5e, 0xfc, 0x2c, 0x7e, + 0x8a, 0x3d, 0xee, 0xd1, 0x93, 0x48, 0xfb, 0x45, 0x24, 0x93, 0x69, 0x32, 0x6d, 0xda, 0x2e, 0x82, + 0xb7, 0xbc, 0xcf, 0x3c, 0xf9, 0xf5, 0x99, 0xc9, 0xc3, 0x14, 0x0e, 0x39, 0x0b, 0x47, 0x6c, 0x12, + 0xf8, 0x21, 0xef, 0x21, 0xb7, 0x39, 0xc3, 0xcb, 0xd0, 0xe9, 0xf1, 0xcb, 0x98, 0x61, 0x37, 0x9e, + 0x44, 0x3c, 0x22, 0xad, 0xdc, 0xd1, 0xcd, 0x1c, 0x9d, 0x96, 0x1b, 0xb9, 0x91, 0x30, 0xf4, 0x92, + 0xa7, 0xd4, 0xdb, 0x39, 0x50, 0x68, 0x82, 0xa1, 0x92, 0x3a, 0x77, 0x0b, 0xab, 0xb1, 0x3d, 0xb1, + 0x03, 0xb9, 0x6c, 0xfd, 0x2c, 0x43, 0xe5, 0x25, 0x43, 0xb4, 0x5d, 0x46, 0x2e, 0xa0, 0x89, 0xa1, + 0x1d, 0xa3, 0x17, 0x71, 0x1c, 0x4c, 0xd8, 0xe7, 0x29, 0x43, 0xde, 0xd6, 0x0f, 0xf5, 0xe3, 0xbd, + 0x27, 0x8f, 0xba, 0xeb, 0x02, 0x75, 0xcf, 0x17, 0x76, 0x9a, 0xba, 0x4f, 0x35, 0xda, 0xc0, 0x15, + 0x8d, 0xbc, 0x05, 0xa2, 0x62, 0x31, 0x8e, 0x42, 0x64, 0xed, 0x5b, 0x82, 0x7b, 0x74, 0x23, 0x37, + 0xb5, 0x9f, 0x6a, 0xb4, 0x89, 0xab, 0x22, 0x79, 0x01, 0x86, 0xe3, 0x4d, 0xc3, 0x4f, 0x59, 0xd8, + 0x1d, 0x01, 0xb5, 0xd6, 0x43, 0x4f, 0x12, 0x6b, 0x1e, 0xb4, 0xe6, 0x28, 0x33, 0x39, 0x83, 0xfa, + 0x02, 0x25, 0x03, 0x96, 0x04, 0xeb, 0xc1, 0x56, 0x56, 0x16, 0xce, 0x70, 0x54, 0x81, 0xbc, 0x83, + 0xdb, 0x63, 0xdf, 0xf5, 0xf8, 0x60, 0x38, 0x8e, 0x9c, 0x3c, 0x5e, 0x79, 0xdb, 0x9e, 0xcf, 0x92, + 0x17, 0xfa, 0x89, 0x3f, 0xcf, 0xd8, 0x1c, 0xaf, 0x8a, 0xe4, 0x03, 0xb4, 0x96, 0xd1, 0x32, 0xee, + 0xae, 0x60, 0x1f, 0xdf, 0xcc, 0xce, 0x32, 0x93, 0x71, 0x41, 0x4d, 0x8e, 0x21, 0xad, 0x47, 0x96, + 0xb9, 0xb2, 0xed, 0x18, 0x5e, 0x0b, 0x6f, 0x9e, 0xd7, 0x88, 0x55, 0x81, 0xbc, 0x82, 0xfd, 0x8c, + 0x26, 0x63, 0x56, 0x05, 0xee, 0xe1, 0x76, 0x5c, 0x16, 0xb1, 0x1e, 0x2f, 0x29, 0xfd, 0x32, 0xec, + 0xe0, 0x34, 0xb0, 0x08, 0x34, 0x56, 0x9b, 0x67, 0x7d, 0xd7, 0xa1, 0x59, 0xa8, 0x0d, 0xb9, 0x03, + 0xbb, 0x1e, 0x4b, 0xb6, 0x29, 0x7a, 0x5c, 0xa2, 0x72, 0x4a, 0xf4, 0x8f, 0xd1, 0x24, 0xb0, 0xb9, + 0xe8, 0xa1, 0x41, 0xe5, 0x94, 0xe8, 0xe2, 0x4b, 0xa2, 0xa8, 0x92, 0x41, 0xe5, 0x44, 0x08, 0x94, + 0x3c, 0x1b, 0x3d, 0x51, 0x8a, 0x1a, 0x15, 0xcf, 0xa4, 0x03, 0xd5, 0x80, 0x71, 0x7b, 0x64, 0x73, + 0x5b, 0x7c, 0xd9, 0x1a, 0xcd, 0x66, 0xeb, 0x0d, 0xd4, 0xd4, 0xba, 0xfd, 0x73, 0x8e, 0x16, 0x94, + 0xfd, 0x70, 0xc4, 0xbe, 0xc8, 0x18, 0xe9, 0x60, 0x7d, 0xd3, 0xc1, 0x58, 0x6a, 0xde, 0xff, 0xe1, + 0x26, 0xaa, 0xd8, 0xa7, 0xdc, 0x5e, 0x3a, 0x90, 0x36, 0x54, 0x02, 0x1f, 0xd1, 0x0f, 0x5d, 0xb1, + 0xbd, 0x2a, 0x5d, 0x8c, 0xd6, 0x63, 0x68, 0x16, 0xda, 0xba, 0x29, 0x8a, 0x75, 0x0e, 0xa4, 0x58, + 0x3f, 0xf2, 0x1c, 0xf6, 0x94, 0x1a, 0xcb, 0x5b, 0xe6, 0x40, 0xad, 0x45, 0x7a, 0x89, 0x29, 0xaf, + 0x42, 0xde, 0x57, 0xeb, 0x08, 0x8c, 0xa5, 0xee, 0x6d, 0xfc, 0xf5, 0xaf, 0x50, 0x5f, 0x6e, 0xd5, + 0xc6, 0x23, 0xa3, 0xd0, 0x70, 0x12, 0x43, 0x88, 0x53, 0x1c, 0xa4, 0xbd, 0x93, 0x97, 0xd4, 0xfd, + 0x62, 0xac, 0x93, 0x85, 0x33, 0x85, 0xf7, 0x4b, 0x57, 0xbf, 0xef, 0x69, 0x74, 0xdf, 0x59, 0x91, + 0x2f, 0xae, 0x66, 0xa6, 0x7e, 0x3d, 0x33, 0xf5, 0x3f, 0x33, 0x53, 0xff, 0x31, 0x37, 0xb5, 0xeb, + 0xb9, 0xa9, 0xfd, 0x9a, 0x9b, 0xda, 0xfb, 0x67, 0xae, 0xcf, 0xbd, 0xe9, 0xb0, 0xeb, 0x44, 0x41, + 0x4f, 0xbd, 0xa1, 0xf3, 0xc7, 0xf4, 0x9e, 0x5f, 0xf7, 0x4f, 0x31, 0xdc, 0x15, 0x6b, 0x4f, 0xff, + 0x06, 0x00, 0x00, 0xff, 0xff, 0xa1, 0xb2, 0xfd, 0x65, 
0x48, 0x06, 0x00, 0x00, } func (m *Message) Marshal() (dAtA []byte, err error) { @@ -701,6 +832,48 @@ func (m *Message_LightBlockResponse) MarshalToSizedBuffer(dAtA []byte) (int, err } return len(dAtA) - i, nil } +func (m *Message_ParamsRequest) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *Message_ParamsRequest) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + if m.ParamsRequest != nil { + { + size, err := m.ParamsRequest.MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintTypes(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0x3a + } + return len(dAtA) - i, nil +} +func (m *Message_ParamsResponse) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *Message_ParamsResponse) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + if m.ParamsResponse != nil { + { + size, err := m.ParamsResponse.MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintTypes(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0x42 + } + return len(dAtA) - i, nil +} func (m *SnapshotsRequest) Marshal() (dAtA []byte, err error) { size := m.Size() dAtA = make([]byte, size) @@ -932,6 +1105,72 @@ func (m *LightBlockResponse) MarshalToSizedBuffer(dAtA []byte) (int, error) { return len(dAtA) - i, nil } +func (m *ParamsRequest) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *ParamsRequest) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *ParamsRequest) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + if m.Height != 0 { + i = encodeVarintTypes(dAtA, i, uint64(m.Height)) + i-- + dAtA[i] = 0x8 + } + return len(dAtA) - i, nil +} + +func (m *ParamsResponse) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalToSizedBuffer(dAtA[:size]) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *ParamsResponse) MarshalTo(dAtA []byte) (int, error) { + size := m.Size() + return m.MarshalToSizedBuffer(dAtA[:size]) +} + +func (m *ParamsResponse) MarshalToSizedBuffer(dAtA []byte) (int, error) { + i := len(dAtA) + _ = i + var l int + _ = l + { + size, err := m.ConsensusParams.MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintTypes(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0x12 + if m.Height != 0 { + i = encodeVarintTypes(dAtA, i, uint64(m.Height)) + i-- + dAtA[i] = 0x8 + } + return len(dAtA) - i, nil +} + func encodeVarintTypes(dAtA []byte, offset int, v uint64) int { offset -= sovTypes(v) base := offset @@ -1027,6 +1266,30 @@ func (m *Message_LightBlockResponse) Size() (n int) { } return n } +func (m *Message_ParamsRequest) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if m.ParamsRequest != nil { + l = m.ParamsRequest.Size() + n += 1 + l + sovTypes(uint64(l)) + } + return n +} +func (m *Message_ParamsResponse) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if m.ParamsResponse != nil { + l = m.ParamsResponse.Size() + n += 1 + l + sovTypes(uint64(l)) + } + return n +} func (m *SnapshotsRequest) Size() (n int) { if m == nil { return 0 @@ -1130,6 
+1393,32 @@ func (m *LightBlockResponse) Size() (n int) { return n } +func (m *ParamsRequest) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if m.Height != 0 { + n += 1 + sovTypes(uint64(m.Height)) + } + return n +} + +func (m *ParamsResponse) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if m.Height != 0 { + n += 1 + sovTypes(uint64(m.Height)) + } + l = m.ConsensusParams.Size() + n += 1 + l + sovTypes(uint64(l)) + return n +} + func sovTypes(x uint64) (n int) { return (math_bits.Len64(x|1) + 6) / 7 } @@ -1375,6 +1664,76 @@ func (m *Message) Unmarshal(dAtA []byte) error { } m.Sum = &Message_LightBlockResponse{v} iNdEx = postIndex + case 7: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field ParamsRequest", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTypes + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthTypes + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthTypes + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + v := &ParamsRequest{} + if err := v.Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + m.Sum = &Message_ParamsRequest{v} + iNdEx = postIndex + case 8: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field ParamsResponse", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTypes + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthTypes + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthTypes + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + v := &ParamsResponse{} + if err := v.Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + m.Sum = &Message_ParamsResponse{v} + iNdEx = postIndex default: iNdEx = preIndex skippy, err := skipTypes(dAtA[iNdEx:]) @@ -2044,6 +2403,177 @@ func (m *LightBlockResponse) Unmarshal(dAtA []byte) error { } return nil } +func (m *ParamsRequest) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTypes + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: ParamsRequest: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: ParamsRequest: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Height", wireType) + } + m.Height = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTypes + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Height |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + default: + iNdEx = preIndex + skippy, err := skipTypes(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthTypes + } + if (iNdEx + skippy) > l 
{ + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *ParamsResponse) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTypes + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: ParamsResponse: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: ParamsResponse: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Height", wireType) + } + m.Height = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTypes + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Height |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field ConsensusParams", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowTypes + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthTypes + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthTypes + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + if err := m.ConsensusParams.Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipTypes(dAtA[iNdEx:]) + if err != nil { + return err + } + if (skippy < 0) || (iNdEx+skippy) < 0 { + return ErrInvalidLengthTypes + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} func skipTypes(dAtA []byte) (n int, err error) { l := len(dAtA) iNdEx := 0 diff --git a/proto/tendermint/statesync/types.proto b/proto/tendermint/statesync/types.proto index a4dd8e693..fcfd05f68 100644 --- a/proto/tendermint/statesync/types.proto +++ b/proto/tendermint/statesync/types.proto @@ -1,7 +1,9 @@ syntax = "proto3"; package tendermint.statesync; +import "gogoproto/gogo.proto"; import "tendermint/types/types.proto"; +import "tendermint/types/params.proto"; option go_package = "github.com/tendermint/tendermint/proto/tendermint/statesync"; @@ -13,6 +15,8 @@ message Message { ChunkResponse chunk_response = 4; LightBlockRequest light_block_request = 5; LightBlockResponse light_block_response = 6; + ParamsRequest params_request = 7; + ParamsResponse params_response = 8; } } @@ -46,4 +50,13 @@ message LightBlockRequest { message LightBlockResponse { tendermint.types.LightBlock light_block = 1; +} + +message ParamsRequest { + uint64 height = 1; +} + +message ParamsResponse { + uint64 height = 1; + tendermint.types.ConsensusParams consensus_params = 2 [(gogoproto.nullable) = false]; } \ No newline at end of file diff --git a/rpc/client/evidence_test.go b/rpc/client/evidence_test.go index 0ff158e56..5626b7f48 100644 --- a/rpc/client/evidence_test.go +++ b/rpc/client/evidence_test.go @@ -29,7 +29,7 @@ var defaultTestTime = 
time.Date(2018, 10, 10, 8, 20, 13, 695936996, time.UTC) func newEvidence(t *testing.T, val *privval.FilePV, vote *types.Vote, vote2 *types.Vote, chainID string) *types.DuplicateVoteEvidence { - + t.Helper() var err error v := vote.ToProto() @@ -44,7 +44,9 @@ func newEvidence(t *testing.T, val *privval.FilePV, validator := types.NewValidator(val.Key.PubKey, 10) valSet := types.NewValidatorSet([]*types.Validator{validator}) - return types.NewDuplicateVoteEvidence(vote, vote2, defaultTestTime, valSet) + ev, err := types.NewDuplicateVoteEvidence(vote, vote2, defaultTestTime, valSet) + require.NoError(t, err) + return ev } func makeEvidences( diff --git a/rpc/client/http/http.go b/rpc/client/http/http.go index 54c56f99f..26a0ea5de 100644 --- a/rpc/client/http/http.go +++ b/rpc/client/http/http.go @@ -419,7 +419,7 @@ func (c *baseRPCClient) Block(ctx context.Context, height *int64) (*ctypes.Resul return result, nil } -func (c *baseRPCClient) BlockByHash(ctx context.Context, hash []byte) (*ctypes.ResultBlock, error) { +func (c *baseRPCClient) BlockByHash(ctx context.Context, hash bytes.HexBytes) (*ctypes.ResultBlock, error) { result := new(ctypes.ResultBlock) params := map[string]interface{}{ "hash": hash, @@ -460,7 +460,7 @@ func (c *baseRPCClient) Commit(ctx context.Context, height *int64) (*ctypes.Resu return result, nil } -func (c *baseRPCClient) Tx(ctx context.Context, hash []byte, prove bool) (*ctypes.ResultTx, error) { +func (c *baseRPCClient) Tx(ctx context.Context, hash bytes.HexBytes, prove bool) (*ctypes.ResultTx, error) { result := new(ctypes.ResultTx) params := map[string]interface{}{ "hash": hash, diff --git a/rpc/client/interface.go b/rpc/client/interface.go index 3547b42ae..8244e9295 100644 --- a/rpc/client/interface.go +++ b/rpc/client/interface.go @@ -67,11 +67,11 @@ type ABCIClient interface { // and prove anything about the chain. type SignClient interface { Block(ctx context.Context, height *int64) (*ctypes.ResultBlock, error) - BlockByHash(ctx context.Context, hash []byte) (*ctypes.ResultBlock, error) + BlockByHash(ctx context.Context, hash bytes.HexBytes) (*ctypes.ResultBlock, error) BlockResults(ctx context.Context, height *int64) (*ctypes.ResultBlockResults, error) Commit(ctx context.Context, height *int64) (*ctypes.ResultCommit, error) Validators(ctx context.Context, height *int64, page, perPage *int) (*ctypes.ResultValidators, error) - Tx(ctx context.Context, hash []byte, prove bool) (*ctypes.ResultTx, error) + Tx(ctx context.Context, hash bytes.HexBytes, prove bool) (*ctypes.ResultTx, error) // TxSearch defines a method to search for a paginated set of transactions by // DeliverTx event search criteria. 
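The switch from `[]byte` to `bytes.HexBytes` for hash parameters is about JSON: `encoding/json` renders a raw `[]byte` as base64, while `HexBytes` marshals to and from a hex string, which is what the RPC's reflective parameter decoder expects (see the `N.B.` comments added in `rpc/core` below and issue #6802). A small self-contained sketch of the difference, assuming only the `libs/bytes` package from this repository:

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/tendermint/tendermint/libs/bytes"
)

func main() {
	hash := []byte{0xde, 0xad, 0xbe, 0xef}

	asBytes, _ := json.Marshal(hash)               // "3q2+7w==" (base64)
	asHex, _ := json.Marshal(bytes.HexBytes(hash)) // "DEADBEEF" (hex)
	fmt.Println(string(asBytes), string(asHex))

	// Decoding a JSON hex string recovers the original raw bytes.
	var decoded bytes.HexBytes
	if err := json.Unmarshal([]byte(`"DEADBEEF"`), &decoded); err != nil {
		panic(err)
	}
	fmt.Printf("%X\n", []byte(decoded)) // DEADBEEF
}
```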
diff --git a/rpc/client/local/local.go b/rpc/client/local/local.go index d752e6a93..39c4295ac 100644 --- a/rpc/client/local/local.go +++ b/rpc/client/local/local.go @@ -166,7 +166,7 @@ func (c *Local) Block(ctx context.Context, height *int64) (*ctypes.ResultBlock, return c.env.Block(c.ctx, height) } -func (c *Local) BlockByHash(ctx context.Context, hash []byte) (*ctypes.ResultBlock, error) { +func (c *Local) BlockByHash(ctx context.Context, hash bytes.HexBytes) (*ctypes.ResultBlock, error) { return c.env.BlockByHash(c.ctx, hash) } @@ -182,7 +182,7 @@ func (c *Local) Validators(ctx context.Context, height *int64, page, perPage *in return c.env.Validators(c.ctx, height, page, perPage) } -func (c *Local) Tx(ctx context.Context, hash []byte, prove bool) (*ctypes.ResultTx, error) { +func (c *Local) Tx(ctx context.Context, hash bytes.HexBytes, prove bool) (*ctypes.ResultTx, error) { return c.env.Tx(c.ctx, hash, prove) } diff --git a/rpc/client/mock/client.go b/rpc/client/mock/client.go index 57e96fb09..8ff474dd5 100644 --- a/rpc/client/mock/client.go +++ b/rpc/client/mock/client.go @@ -166,7 +166,7 @@ func (c Client) Block(ctx context.Context, height *int64) (*ctypes.ResultBlock, return c.env.Block(&rpctypes.Context{}, height) } -func (c Client) BlockByHash(ctx context.Context, hash []byte) (*ctypes.ResultBlock, error) { +func (c Client) BlockByHash(ctx context.Context, hash bytes.HexBytes) (*ctypes.ResultBlock, error) { return c.env.BlockByHash(&rpctypes.Context{}, hash) } diff --git a/rpc/client/mocks/client.go b/rpc/client/mocks/client.go index ef374b9a8..8e4c7cbf5 100644 --- a/rpc/client/mocks/client.go +++ b/rpc/client/mocks/client.go @@ -115,11 +115,11 @@ func (_m *Client) Block(ctx context.Context, height *int64) (*coretypes.ResultBl } // BlockByHash provides a mock function with given fields: ctx, hash -func (_m *Client) BlockByHash(ctx context.Context, hash []byte) (*coretypes.ResultBlock, error) { +func (_m *Client) BlockByHash(ctx context.Context, hash bytes.HexBytes) (*coretypes.ResultBlock, error) { ret := _m.Called(ctx, hash) var r0 *coretypes.ResultBlock - if rf, ok := ret.Get(0).(func(context.Context, []byte) *coretypes.ResultBlock); ok { + if rf, ok := ret.Get(0).(func(context.Context, bytes.HexBytes) *coretypes.ResultBlock); ok { r0 = rf(ctx, hash) } else { if ret.Get(0) != nil { @@ -128,7 +128,7 @@ func (_m *Client) BlockByHash(ctx context.Context, hash []byte) (*coretypes.Resu } var r1 error - if rf, ok := ret.Get(1).(func(context.Context, []byte) error); ok { + if rf, ok := ret.Get(1).(func(context.Context, bytes.HexBytes) error); ok { r1 = rf(ctx, hash) } else { r1 = ret.Error(1) @@ -706,11 +706,11 @@ func (_m *Client) Subscribe(ctx context.Context, subscriber string, query string } // Tx provides a mock function with given fields: ctx, hash, prove -func (_m *Client) Tx(ctx context.Context, hash []byte, prove bool) (*coretypes.ResultTx, error) { +func (_m *Client) Tx(ctx context.Context, hash bytes.HexBytes, prove bool) (*coretypes.ResultTx, error) { ret := _m.Called(ctx, hash, prove) var r0 *coretypes.ResultTx - if rf, ok := ret.Get(0).(func(context.Context, []byte, bool) *coretypes.ResultTx); ok { + if rf, ok := ret.Get(0).(func(context.Context, bytes.HexBytes, bool) *coretypes.ResultTx); ok { r0 = rf(ctx, hash, prove) } else { if ret.Get(0) != nil { @@ -719,7 +719,7 @@ func (_m *Client) Tx(ctx context.Context, hash []byte, prove bool) (*coretypes.R } var r1 error - if rf, ok := ret.Get(1).(func(context.Context, []byte, bool) error); ok { + if rf, ok := 
ret.Get(1).(func(context.Context, bytes.HexBytes, bool) error); ok { r1 = rf(ctx, hash, prove) } else { r1 = ret.Error(1) diff --git a/rpc/core/blocks.go b/rpc/core/blocks.go index 081276d0f..78b567583 100644 --- a/rpc/core/blocks.go +++ b/rpc/core/blocks.go @@ -4,6 +4,7 @@ import ( "fmt" "sort" + "github.com/tendermint/tendermint/libs/bytes" tmmath "github.com/tendermint/tendermint/libs/math" tmquery "github.com/tendermint/tendermint/libs/pubsub/query" ctypes "github.com/tendermint/tendermint/rpc/core/types" @@ -107,7 +108,11 @@ func (env *Environment) Block(ctx *rpctypes.Context, heightPtr *int64) (*ctypes. // BlockByHash gets block by hash. // More: https://docs.tendermint.com/master/rpc/#/Info/block_by_hash -func (env *Environment) BlockByHash(ctx *rpctypes.Context, hash []byte) (*ctypes.ResultBlock, error) { +func (env *Environment) BlockByHash(ctx *rpctypes.Context, hash bytes.HexBytes) (*ctypes.ResultBlock, error) { + // N.B. The hash parameter is HexBytes so that the reflective parameter + // decoding logic in the HTTP service will correctly translate from JSON. + // See https://github.com/tendermint/tendermint/issues/6802 for context. + block := env.BlockStore.LoadBlockByHash(hash) if block == nil { return &ctypes.ResultBlock{BlockID: types.BlockID{}, Block: nil}, nil diff --git a/rpc/core/tx.go b/rpc/core/tx.go index 1b3da3075..eb6c73858 100644 --- a/rpc/core/tx.go +++ b/rpc/core/tx.go @@ -5,6 +5,7 @@ import ( "fmt" "sort" + "github.com/tendermint/tendermint/libs/bytes" tmmath "github.com/tendermint/tendermint/libs/math" tmquery "github.com/tendermint/tendermint/libs/pubsub/query" ctypes "github.com/tendermint/tendermint/rpc/core/types" @@ -17,9 +18,13 @@ import ( // transaction is in the mempool, invalidated, or was not sent in the first // place. // More: https://docs.tendermint.com/master/rpc/#/Info/tx -func (env *Environment) Tx(ctx *rpctypes.Context, hash []byte, prove bool) (*ctypes.ResultTx, error) { +func (env *Environment) Tx(ctx *rpctypes.Context, hash bytes.HexBytes, prove bool) (*ctypes.ResultTx, error) { // if index is disabled, return error + // N.B. The hash parameter is HexBytes so that the reflective parameter + // decoding logic in the HTTP service will correctly translate from JSON. + // See https://github.com/tendermint/tendermint/issues/6802 for context. + if !indexer.KVSinkEnabled(env.EventSinks) { return nil, errors.New("transaction querying is disabled due to no kvEventSink") } diff --git a/rpc/openapi/openapi.yaml b/rpc/openapi/openapi.yaml index 535320b3f..d5a9ffafa 100644 --- a/rpc/openapi/openapi.yaml +++ b/rpc/openapi/openapi.yaml @@ -601,6 +601,32 @@ paths: application/json: schema: $ref: "#/components/schemas/ErrorResponse" + /unsafe_flush_mempool: + get: + summary: Flush mempool of all unconfirmed transactions + operationId: unsafe_flush_mempool + tags: + - Unsafe + description: | + Flush flushes out the mempool. It acquires a read-lock, fetches all the + transactions currently in the transaction store and removes each transaction + from the store and all indexes and finally resets the cache. + + Note, flushing the mempool may leave the mempool in an inconsistent state. + responses: + "200": + description: empty answer + content: + application/json: + schema: + $ref: "#/components/schemas/EmptyResponse" + "500": + description: empty error + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + /blockchain: get: summary: "Get block headers (max: 20) for minHeight <= height <= maxHeight." 
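The new `unsafe_flush_mempool` method takes no parameters, so exercising it is a single GET against the RPC server. A hedged usage sketch, assuming a node running locally with unsafe RPC methods enabled; the address and port are illustrative, not part of this change:

```go
package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
)

func main() {
	// Flush the mempool of the local node; on success the JSON-RPC
	// envelope carries an empty result object.
	resp, err := http.Get("http://localhost:26657/unsafe_flush_mempool")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(resp.Status, string(body))
}
```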
diff --git a/test/e2e/Makefile b/test/e2e/Makefile
index 38ce809e6..2b41cc1cd 100644
--- a/test/e2e/Makefile
+++ b/test/e2e/Makefile
@@ -1,4 +1,4 @@
-all: docker generator runner
+all: docker generator runner tests

 docker:
 	docker build --tag tendermint/e2e-node -f docker/Dockerfile ../..
@@ -15,4 +15,7 @@ generator:
 runner:
 	go build -o build/runner ./runner

-.PHONY: all app docker generator runner
+tests:
+	go test -o build/tests ./tests
+
+.PHONY: all app docker generator runner tests
diff --git a/test/e2e/generator/generate.go b/test/e2e/generator/generate.go
index 28732967f..2d6945e65 100644
--- a/test/e2e/generator/generate.go
+++ b/test/e2e/generator/generate.go
@@ -48,10 +48,10 @@ var (
 	// FIXME: v2 disabled due to flake
 	nodeBlockSyncs = uniformChoice{"v0"} // "v2"
 	nodeMempools   = uniformChoice{"v0", "v1"}
-	nodeStateSyncs = uniformChoice{false, true}
+	nodeStateSyncs = uniformChoice{e2e.StateSyncDisabled, e2e.StateSyncP2P, e2e.StateSyncRPC}
 	nodePersistIntervals  = uniformChoice{0, 1, 5}
 	nodeSnapshotIntervals = uniformChoice{0, 3}
-	nodeRetainBlocks = uniformChoice{0, int(e2e.EvidenceAgeHeight), int(e2e.EvidenceAgeHeight) + 5}
+	nodeRetainBlocks = uniformChoice{0, 2 * int(e2e.EvidenceAgeHeight), 4 * int(e2e.EvidenceAgeHeight)}
 	nodePerturbations = probSetChoice{
 		"disconnect": 0.1,
 		"pause":      0.1,
@@ -87,11 +87,19 @@ func Generate(r *rand.Rand, opts Options) ([]e2e.Manifest, error) {
 		}
 		manifests = append(manifests, manifest)
 	}
+
+	if opts.Sorted {
+		// When the sorted flag is set (generally, as long as groups
+		// aren't set), order the manifests from least to most complex.
+		e2e.SortManifests(manifests)
+	}
+
 	return manifests, nil
 }

 type Options struct {
-	P2P P2PMode
+	P2P    P2PMode
+	Sorted bool
 }

 type P2PMode string
@@ -119,18 +127,11 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er
 		TxSize: int64(txSize.Choose(r).(int)),
 	}

-	var p2pNodeFactor int
-
-	switch opt["p2p"].(P2PMode) {
-	case NewP2PMode:
-		manifest.UseLegacyP2P = true
-	case LegacyP2PMode:
-		manifest.UseLegacyP2P = false
-	case HybridP2PMode:
-		manifest.UseLegacyP2P = true
-		p2pNodeFactor = 2
+	p2pMode := opt["p2p"].(P2PMode)
+	switch p2pMode {
+	case NewP2PMode, LegacyP2PMode, HybridP2PMode:
 	default:
-		return manifest, fmt.Errorf("unknown p2p mode %s", opt["p2p"])
+		return manifest, fmt.Errorf("unknown p2p mode %s", p2pMode)
 	}

 	var numSeeds, numValidators, numFulls, numLightClients int
@@ -153,10 +154,11 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er

 	for i := 1; i <= numSeeds; i++ {
 		node := generateNode(r, e2e.ModeSeed, 0, manifest.InitialHeight, false)
-		if p2pNodeFactor == 0 {
-			node.UseLegacyP2P = manifest.UseLegacyP2P
-		} else if p2pNodeFactor%i == 0 {
-			node.UseLegacyP2P = !manifest.UseLegacyP2P
+		switch p2pMode {
+		case LegacyP2PMode:
+			node.UseLegacyP2P = true
+		case HybridP2PMode:
+			node.UseLegacyP2P = r.Intn(2) == 1
 		}

 		manifest.Nodes[fmt.Sprintf("seed%02d", i)] = node
@@ -177,10 +179,11 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er
 		node := generateNode(
 			r, e2e.ModeValidator, startAt, manifest.InitialHeight, i <= 2)

-		if p2pNodeFactor == 0 {
-			node.UseLegacyP2P = manifest.UseLegacyP2P
-		} else if p2pNodeFactor%i == 0 {
-			node.UseLegacyP2P = !manifest.UseLegacyP2P
+		switch p2pMode {
+		case LegacyP2PMode:
+			node.UseLegacyP2P = true
+		case HybridP2PMode:
+			node.UseLegacyP2P = r.Intn(2) == 1
 		}

 		manifest.Nodes[name] = node
@@ -213,11 +216,13 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er
 		}

 		node := generateNode(r, e2e.ModeFull,
startAt, manifest.InitialHeight, false) - if p2pNodeFactor == 0 { - node.UseLegacyP2P = manifest.UseLegacyP2P - } else if p2pNodeFactor%i == 0 { - node.UseLegacyP2P = !manifest.UseLegacyP2P + switch p2pMode { + case LegacyP2PMode: + node.UseLegacyP2P = true + case HybridP2PMode: + node.UseLegacyP2P = r.Intn(2) == 1 } + manifest.Nodes[fmt.Sprintf("full%02d", i)] = node } @@ -291,13 +296,17 @@ func generateNode( PrivvalProtocol: nodePrivvalProtocols.Choose(r), BlockSync: nodeBlockSyncs.Choose(r).(string), Mempool: nodeMempools.Choose(r).(string), - StateSync: nodeStateSyncs.Choose(r).(bool) && startAt > 0, + StateSync: e2e.StateSyncDisabled, PersistInterval: ptrUint64(uint64(nodePersistIntervals.Choose(r).(int))), SnapshotInterval: uint64(nodeSnapshotIntervals.Choose(r).(int)), RetainBlocks: uint64(nodeRetainBlocks.Choose(r).(int)), Perturb: nodePerturbations.Choose(r), } + if startAt > 0 { + node.StateSync = nodeStateSyncs.Choose(r).(string) + } + // If this node is forced to be an archive node, retain all blocks and // enable state sync snapshotting. if forceArchive { @@ -326,7 +335,7 @@ func generateNode( } } - if node.StateSync { + if node.StateSync != e2e.StateSyncDisabled { node.BlockSync = "v0" } diff --git a/test/e2e/generator/main.go b/test/e2e/generator/main.go index f353241fc..7dd096760 100644 --- a/test/e2e/generator/main.go +++ b/test/e2e/generator/main.go @@ -57,6 +57,10 @@ func NewCLI() *CLI { return fmt.Errorf("p2p mode must be either new, legacy, hybrid or mixed got %s", p2pMode) } + if groups == 0 { + opts.Sorted = true + } + return cli.generate(dir, groups, opts) }, } diff --git a/test/e2e/networks/ci.toml b/test/e2e/networks/ci.toml index 00c73ccbd..7e07febd5 100644 --- a/test/e2e/networks/ci.toml +++ b/test/e2e/networks/ci.toml @@ -43,6 +43,7 @@ persist_interval = 0 perturb = ["restart"] privval_protocol = "tcp" seeds = ["seed01"] +block_sync = "v0" [node.validator03] database = "badgerdb" @@ -51,18 +52,22 @@ abci_protocol = "grpc" persist_interval = 3 perturb = ["kill"] privval_protocol = "grpc" -retain_blocks = 7 +block_sync = "v0" +retain_blocks = 10 [node.validator04] abci_protocol = "builtin" +snapshot_interval = 5 database = "rocksdb" persistent_peers = ["validator01"] perturb = ["pause"] +block_sync = "v0" [node.validator05] -database = "cleveldb" -block_sync = "v0" -seeds = ["seed01"] +database = "cleveldb" +block_sync = "v0" +state_sync = "p2p" +seeds = ["seed01"] start_at = 1005 # Becomes part of the validator set at 1010 abci_protocol = "grpc" perturb = ["pause", "disconnect", "restart"] @@ -71,12 +76,11 @@ privval_protocol = "tcp" [node.full01] mode = "full" start_at = 1010 -# FIXME: should be v2, disabled due to flake block_sync = "v0" -persistent_peers = ["validator01", "validator02", "validator03", "validator04", "validator05"] +persistent_peers = ["validator01", "validator02", "validator03", "validator04"] perturb = ["restart"] -retain_blocks = 7 -state_sync = true +retain_blocks = 10 +state_sync = "rpc" [node.light01] mode = "light" diff --git a/test/e2e/pkg/manifest.go b/test/e2e/pkg/manifest.go index 1b0fc8753..2a8f73127 100644 --- a/test/e2e/pkg/manifest.go +++ b/test/e2e/pkg/manifest.go @@ -3,6 +3,7 @@ package e2e import ( "fmt" "os" + "sort" "github.com/BurntSushi/toml" ) @@ -59,9 +60,6 @@ type Manifest struct { // by individual nodes. LogLevel string `toml:"log_level"` - // UseLegacyP2P uses the legacy p2p layer for all nodes in a test. 
- UseLegacyP2P bool `toml:"use_legacy_p2p"` - // QueueType describes the type of queue that the system uses internally QueueType string `toml:"queue_type"` @@ -117,7 +115,8 @@ type ManifestNode struct { // block hashes and RPC servers. At least one node in the network must have // SnapshotInterval set to non-zero, and the state syncing node must have // StartAt set to an appropriate height where a snapshot is available. - StateSync bool `toml:"state_sync"` + // StateSync can either be "p2p" or "rpc" or an empty string to disable + StateSync string `toml:"state_sync"` // PersistInterval specifies the height interval at which the application // will persist state to disk. Defaults to 1 (every height), setting this to @@ -169,3 +168,43 @@ func LoadManifest(file string) (Manifest, error) { } return manifest, nil } + +// SortManifests orders (in-place) a list of manifests such that the +// manifests will be ordered (vaguely) from least complex to most +// complex. +func SortManifests(manifests []Manifest) { + sort.SliceStable(manifests, func(i, j int) bool { + left, right := manifests[i], manifests[j] + + if len(left.Nodes) < len(right.Nodes) { + return true + } + + if left.InitialHeight < right.InitialHeight { + return true + } + + if left.TxSize < right.TxSize { + return true + } + + if left.Evidence < right.Evidence { + return true + } + + var ( + leftPerturb int + rightPerturb int + ) + + for _, n := range left.Nodes { + leftPerturb += len(n.Perturb) + } + for _, n := range right.Nodes { + rightPerturb += len(n.Perturb) + } + + return leftPerturb < rightPerturb + + }) +} diff --git a/test/e2e/pkg/testnet.go b/test/e2e/pkg/testnet.go index e51fa859e..b54dd2bf0 100644 --- a/test/e2e/pkg/testnet.go +++ b/test/e2e/pkg/testnet.go @@ -50,6 +50,10 @@ const ( EvidenceAgeHeight int64 = 7 EvidenceAgeTime time.Duration = 500 * time.Millisecond + + StateSyncP2P = "p2p" + StateSyncRPC = "rpc" + StateSyncDisabled = "" ) // Testnet represents a single testnet. 
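Note that the comparator in `SortManifests` above falls through to later fields even when an earlier field already ranked the right-hand manifest lower, so it is not a strict weak ordering; the stable sort still produces a rough least-to-most-complex order. A sketch of the same intent with explicit tie-breaks, as an illustrative variant rather than the committed implementation:

```go
package e2e

import "sort"

// sortManifestsStrict orders manifests from least to most complex, consulting
// each field only when all earlier fields compare equal.
func sortManifestsStrict(manifests []Manifest) {
	perturbs := func(m Manifest) (n int) {
		for _, node := range m.Nodes {
			n += len(node.Perturb)
		}
		return n
	}
	sort.SliceStable(manifests, func(i, j int) bool {
		left, right := manifests[i], manifests[j]
		switch {
		case len(left.Nodes) != len(right.Nodes):
			return len(left.Nodes) < len(right.Nodes)
		case left.InitialHeight != right.InitialHeight:
			return left.InitialHeight < right.InitialHeight
		case left.TxSize != right.TxSize:
			return left.TxSize < right.TxSize
		case left.Evidence != right.Evidence:
			return left.Evidence < right.Evidence
		default:
			return perturbs(left) < perturbs(right)
		}
	})
}
```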
@@ -81,7 +85,7 @@ type Node struct { StartAt int64 BlockSync string Mempool string - StateSync bool + StateSync string Database string ABCIProtocol Protocol PrivvalProtocol Protocol @@ -94,6 +98,7 @@ type Node struct { LogLevel string UseLegacyP2P bool QueueType string + HasStarted bool } // LoadTestnet loads a testnet from a manifest file, using the filename to @@ -177,7 +182,7 @@ func LoadTestnet(file string) (*Testnet, error) { Perturbations: []Perturbation{}, LogLevel: manifest.LogLevel, QueueType: manifest.QueueType, - UseLegacyP2P: manifest.UseLegacyP2P && nodeManifest.UseLegacyP2P, + UseLegacyP2P: nodeManifest.UseLegacyP2P, } if node.StartAt == testnet.InitialHeight { @@ -333,6 +338,11 @@ func (n Node) Validate(testnet Testnet) error { default: return fmt.Errorf("invalid block sync setting %q", n.BlockSync) } + switch n.StateSync { + case StateSyncDisabled, StateSyncP2P, StateSyncRPC: + default: + return fmt.Errorf("invalid state sync setting %q", n.StateSync) + } switch n.Mempool { case "", "v0", "v1": default: @@ -366,7 +376,7 @@ func (n Node) Validate(testnet Testnet) error { return fmt.Errorf("cannot start at height %v lower than initial height %v", n.StartAt, n.Testnet.InitialHeight) } - if n.StateSync && n.StartAt == 0 { + if n.StateSync != StateSyncDisabled && n.StartAt == 0 { return errors.New("state synced nodes cannot start at the initial height") } if n.RetainBlocks != 0 && n.RetainBlocks < uint64(EvidenceAgeHeight) { diff --git a/test/e2e/run-multiple.sh b/test/e2e/run-multiple.sh index 5d6a20ef9..571a78a7f 100755 --- a/test/e2e/run-multiple.sh +++ b/test/e2e/run-multiple.sh @@ -19,7 +19,7 @@ FAILED=() for MANIFEST in "$@"; do START=$SECONDS - echo "==> Running testnet $MANIFEST..." + echo "==> Running testnet: $MANIFEST" if ! ./build/runner -f "$MANIFEST"; then echo "==> Testnet $MANIFEST failed, dumping manifest..." diff --git a/test/e2e/runner/benchmark.go b/test/e2e/runner/benchmark.go index 74d2491f5..50a2c33f9 100644 --- a/test/e2e/runner/benchmark.go +++ b/test/e2e/runner/benchmark.go @@ -21,8 +21,8 @@ import ( // // Metrics are based of the `benchmarkLength`, the amount of consecutive blocks // sampled from in the testnet -func Benchmark(testnet *e2e.Testnet, benchmarkLength int64) error { - block, _, err := waitForHeight(testnet, 0) +func Benchmark(ctx context.Context, testnet *e2e.Testnet, benchmarkLength int64) error { + block, err := getLatestBlock(ctx, testnet) if err != nil { return err } @@ -32,13 +32,15 @@ func Benchmark(testnet *e2e.Testnet, benchmarkLength int64) error { // wait for the length of the benchmark period in blocks to pass. We allow 5 seconds for each block // which should be sufficient. 
waitingTime := time.Duration(benchmarkLength*5) * time.Second - endHeight, err := waitForAllNodes(testnet, block.Height+benchmarkLength, waitingTime) + ctx, cancel := context.WithTimeout(ctx, waitingTime) + defer cancel() + block, _, err = waitForHeight(ctx, testnet, block.Height+benchmarkLength) if err != nil { return err } dur := time.Since(startAt) - logger.Info("Ending benchmark period", "height", endHeight) + logger.Info("Ending benchmark period", "height", block.Height) // fetch a sample of blocks blocks, err := fetchBlockChainSample(testnet, benchmarkLength) diff --git a/test/e2e/runner/evidence.go b/test/e2e/runner/evidence.go index 35646ccdb..ab993a2fe 100644 --- a/test/e2e/runner/evidence.go +++ b/test/e2e/runner/evidence.go @@ -28,7 +28,7 @@ const lightClientEvidenceRatio = 4 // evidence and broadcasts it to a random node through the rpc endpoint `/broadcast_evidence`. // Evidence is random and can be a mixture of LightClientAttackEvidence and // DuplicateVoteEvidence. -func InjectEvidence(testnet *e2e.Testnet, amount int) error { +func InjectEvidence(ctx context.Context, testnet *e2e.Testnet, amount int) error { // select a random node var targetNode *e2e.Node @@ -79,9 +79,12 @@ func InjectEvidence(testnet *e2e.Testnet, amount int) error { return err } + wctx, cancel := context.WithTimeout(ctx, time.Minute) + defer cancel() + // wait for the node to reach the height above the forged height so that // it is able to validate the evidence - _, err = waitForNode(targetNode, waitHeight, 30*time.Second) + _, err = waitForNode(wctx, targetNode, waitHeight) if err != nil { return err } @@ -107,9 +110,12 @@ func InjectEvidence(testnet *e2e.Testnet, amount int) error { } } + wctx, cancel = context.WithTimeout(ctx, 30*time.Second) + defer cancel() + // wait for the node to reach the height above the forged height so that // it is able to validate the evidence - _, err = waitForNode(targetNode, blockRes.Block.Height+2, 10*time.Second) + _, err = waitForNode(wctx, targetNode, blockRes.Block.Height+2) if err != nil { return err } @@ -197,10 +203,10 @@ func generateDuplicateVoteEvidence( chainID string, time time.Time, ) (*types.DuplicateVoteEvidence, error) { - // nolint:gosec // G404: Use of weak random number generator - privVal := privVals[rand.Intn(len(privVals))] - - valIdx, _ := vals.GetByAddress(privVal.PrivKey.PubKey().Address()) + privVal, valIdx, err := getRandomValidatorIndex(privVals, vals) + if err != nil { + return nil, err + } voteA, err := factory.MakeVote(privVal, chainID, valIdx, height, 0, 2, makeRandomBlockID(), time) if err != nil { return nil, err @@ -209,14 +215,27 @@ func generateDuplicateVoteEvidence( if err != nil { return nil, err } - ev := types.NewDuplicateVoteEvidence(voteA, voteB, time, vals) - if ev == nil { - return nil, fmt.Errorf("could not generate evidence a=%v b=%v vals=%v", voteA, voteB, vals) + ev, err := types.NewDuplicateVoteEvidence(voteA, voteB, time, vals) + if err != nil { + return nil, fmt.Errorf("could not generate evidence: %w", err) } return ev, nil } +// getRandomValidatorIndex picks a random validator from a slice of mock PrivVals that's +// also part of the validator set, returning the PrivVal and its index in the validator set +func getRandomValidatorIndex(privVals []types.MockPV, vals *types.ValidatorSet) (types.MockPV, int32, error) { + for _, idx := range rand.Perm(len(privVals)) { + pv := privVals[idx] + valIdx, _ := vals.GetByAddress(pv.PrivKey.PubKey().Address()) + if valIdx >= 0 { + return pv, valIdx, nil + } + } + return 
types.MockPV{}, -1, errors.New("no private validator found in validator set")
+}
+
 func readPrivKey(keyFilePath string) (crypto.PrivKey, error) {
 	keyJSONBytes, err := ioutil.ReadFile(keyFilePath)
 	if err != nil {
diff --git a/test/e2e/runner/load.go b/test/e2e/runner/load.go
index b57c96ddf..b16df5ed2 100644
--- a/test/e2e/runner/load.go
+++ b/test/e2e/runner/load.go
@@ -3,10 +3,9 @@ package main
 import (
 	"container/ring"
 	"context"
-	"crypto/rand"
 	"errors"
 	"fmt"
-	"math"
+	"math/rand"
 	"time"

 	rpchttp "github.com/tendermint/tendermint/rpc/client/http"
@@ -15,9 +14,8 @@ import (
 )

 // Load generates transactions against the network until the given context is
-// canceled. A multiplier of greater than one can be supplied if load needs to
-// be generated beyond a minimum amount.
-func Load(ctx context.Context, testnet *e2e.Testnet, multiplier int) error {
+// canceled.
+func Load(ctx context.Context, testnet *e2e.Testnet) error {
 	// Since transactions are executed across all nodes in the network, we need
 	// to reduce transaction load for larger networks to avoid using too much
 	// CPU. This gives high-throughput small networks and low-throughput large ones.
@@ -27,11 +25,9 @@ func Load(ctx context.Context, testnet *e2e.Testnet, multiplier int) error {
 	if concurrency == 0 {
 		concurrency = 1
 	}
-	initialTimeout := 1 * time.Minute
-	stallTimeout := 30 * time.Second

 	chTx := make(chan types.Tx)
-	chSuccess := make(chan types.Tx)
+	chSuccess := make(chan int) // success counts per iteration
 	ctx, cancel := context.WithCancel(ctx)
 	defer cancel()
@@ -39,61 +35,99 @@
 	logger.Info(fmt.Sprintf("Starting transaction load (%v workers)...", concurrency))
 	started := time.Now()

-	go loadGenerate(ctx, chTx, multiplier, testnet.TxSize)
+	go loadGenerate(ctx, chTx, testnet.TxSize)

 	for w := 0; w < concurrency; w++ {
 		go loadProcess(ctx, testnet, chTx, chSuccess)
 	}

-	// Monitor successful transactions, and abort on stalls.
+	// Monitor transactions to ensure load propagates to the network.
+	//
+	// This loop doesn't check for or time out on stalls, since timing out
+	// here would just abort the load generator sooner and could obscure
+	// backpressure from the test harness, and there are other checks for
+	// stalls in the framework. Ideally we should monitor latency as a guide
+	// for when to give up, but we don't have a good way to track that yet.
 	success := 0
-	timeout := initialTimeout
 	for {
 		select {
-		case <-chSuccess:
-			success++
-			timeout = stallTimeout
-		case <-time.After(timeout):
-			return fmt.Errorf("unable to submit transactions for %v", timeout)
+		case numSeen := <-chSuccess:
+			success += numSeen
 		case <-ctx.Done():
 			if success == 0 {
 				return errors.New("failed to submit any transactions")
 			}
-			logger.Info(fmt.Sprintf("Ending transaction load after %v txs (%.1f tx/s)...",
-				success, float64(success)/time.Since(started).Seconds()))
+
+			rate := float64(success) / time.Since(started).Seconds()
+
+			logger.Info("ending transaction load",
+				"dur_secs", time.Since(started).Seconds(),
+				"txns", success,
+				"rate", rate,
+				"slow", rate < 1)
+
 			return nil
 		}
 	}
 }

-// loadGenerate generates jobs until the context is canceled
-func loadGenerate(ctx context.Context, chTx chan<- types.Tx, multiplier int, size int64) {
-	for i := 0; i < math.MaxInt64; i++ {
+// loadGenerate generates jobs until the context is canceled.
+// +// The chTx has multiple consumers, thus the rate limiting of the load +// generation is primarily the result of backpressure from the +// broadcast transaction, though there is still some timer-based +// limiting. +func loadGenerate(ctx context.Context, chTx chan<- types.Tx, size int64) { + timer := time.NewTimer(0) + defer timer.Stop() + defer close(chTx) + + for { + select { + case <-ctx.Done(): + return + case <-timer.C: + } + // We keep generating the same 100 keys over and over, with different values. // This gives a reasonable load without putting too much data in the app. - id := i % 100 + id := rand.Int63() % 100 // nolint: gosec bz := make([]byte, size) - _, err := rand.Read(bz) + _, err := rand.Read(bz) // nolint: gosec if err != nil { panic(fmt.Sprintf("Failed to read random bytes: %v", err)) } tx := types.Tx(fmt.Sprintf("load-%X=%x", id, bz)) select { - case chTx <- tx: - sqrtSize := int(math.Sqrt(float64(size))) - time.Sleep(10 * time.Millisecond * time.Duration(sqrtSize/multiplier)) - case <-ctx.Done(): - close(chTx) return + case chTx <- tx: + // sleep for a bit before sending the + // next transaction. + timer.Reset(loadGenerateWaitTime(size)) } + } } +func loadGenerateWaitTime(size int64) time.Duration { + const ( + min = int64(100 * time.Millisecond) + max = int64(time.Second) + ) + + var ( + baseJitter = rand.Int63n(max-min+1) + min // nolint: gosec + sizeFactor = size * int64(time.Millisecond) + sizeJitter = rand.Int63n(sizeFactor-min+1) + min // nolint: gosec + ) + + return time.Duration(baseJitter + sizeJitter) +} + // loadProcess processes transactions -func loadProcess(ctx context.Context, testnet *e2e.Testnet, chTx <-chan types.Tx, chSuccess chan<- types.Tx) { +func loadProcess(ctx context.Context, testnet *e2e.Testnet, chTx <-chan types.Tx, chSuccess chan<- int) { // Each worker gets its own client to each usable node, which // allows for some concurrency while still bounding it. 
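+	//
+	// Sketch of the round-robin dispatch used below (illustrative only):
+	//
+	//	clientRing = clientRing.Next()
+	//	client := clientRing.Value.(*rpchttp.HTTP)
+	//
+	// so a slow or syncing node delays only the requests routed to it.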
 	clients := make([]*rpchttp.HTTP, 0, len(testnet.Nodes))
@@ -127,8 +161,7 @@ func loadProcess(ctx context.Context, testnet *e2e.Testnet, chTx <-chan types.Tx
 		clientRing = clientRing.Next()
 	}
-	var err error
-
+	successes := 0
 	for {
 		select {
 		case <-ctx.Done():
 			return
@@ -137,19 +170,24 @@ func loadProcess(ctx context.Context, testnet *e2e.Testnet, chTx <-chan types.Tx
 			clientRing = clientRing.Next()
 			client := clientRing.Value.(*rpchttp.HTTP)
-			if _, err := client.Health(ctx); err != nil {
+			if status, err := client.Status(ctx); err != nil {
+				continue
+			} else if status.SyncInfo.CatchingUp {
 				continue
 			}
-			if _, err = client.BroadcastTxSync(ctx, tx); err != nil {
+			if _, err := client.BroadcastTxSync(ctx, tx); err != nil {
 				continue
 			}
+			successes++
 			select {
-			case chSuccess <- tx:
+			case chSuccess <- successes:
+				successes = 0 // reset counter for the next iteration
 				continue
 			case <-ctx.Done():
 				return
+			default:
 			}
 		}
diff --git a/test/e2e/runner/main.go b/test/e2e/runner/main.go
index cb3d7d6bc..f65b6d0b1 100644
--- a/test/e2e/runner/main.go
+++ b/test/e2e/runner/main.go
@@ -57,44 +57,47 @@ func NewCLI() *CLI {
 			}
 			chLoadResult := make(chan error)
-			ctx, loadCancel := context.WithCancel(context.Background())
+			ctx, cancel := context.WithCancel(cmd.Context())
+			defer cancel()
+
+			lctx, loadCancel := context.WithCancel(ctx)
 			defer loadCancel()
 			go func() {
-				err := Load(ctx, cli.testnet, 1)
-				chLoadResult <- err
+				chLoadResult <- Load(lctx, cli.testnet)
 			}()
-			if err := Start(cli.testnet); err != nil {
+			if err := Start(ctx, cli.testnet); err != nil {
 				return err
 			}
-			if err := Wait(cli.testnet, 5); err != nil { // allow some txs to go through
+			if err := Wait(ctx, cli.testnet, 5); err != nil { // allow some txs to go through
 				return err
 			}
 			if cli.testnet.HasPerturbations() {
-				if err := Perturb(cli.testnet); err != nil {
+				if err := Perturb(ctx, cli.testnet); err != nil {
 					return err
 				}
-				if err := Wait(cli.testnet, 5); err != nil { // allow some txs to go through
+				if err := Wait(ctx, cli.testnet, 5); err != nil { // allow some txs to go through
 					return err
 				}
 			}
 			if cli.testnet.Evidence > 0 {
-				if err := InjectEvidence(cli.testnet, cli.testnet.Evidence); err != nil {
+				if err := InjectEvidence(ctx, cli.testnet, cli.testnet.Evidence); err != nil {
 					return err
 				}
-				if err := Wait(cli.testnet, 5); err != nil { // ensure chain progress
+				if err := Wait(ctx, cli.testnet, 5); err != nil { // ensure chain progress
 					return err
 				}
 			}
 			loadCancel()
+
 			if err := <-chLoadResult; err != nil {
 				return fmt.Errorf("transaction load failed: %w", err)
 			}
-			if err := Wait(cli.testnet, 5); err != nil { // wait for network to settle before tests
+			if err := Wait(ctx, cli.testnet, 5); err != nil { // wait for network to settle before tests
 				return err
 			}
 			if err := Test(cli.testnet); err != nil {
@@ -139,7 +142,7 @@
 			if err != nil {
 				return err
 			}
-			return Start(cli.testnet)
+			return Start(cmd.Context(), cli.testnet)
 		},
 	})
@@ -147,7 +150,7 @@
 		Use:   "perturb",
 		Short: "Perturbs the Docker testnet, e.g. by restarting or disconnecting nodes",
 		RunE: func(cmd *cobra.Command, args []string) error {
-			return Perturb(cli.testnet)
+			return Perturb(cmd.Context(), cli.testnet)
 		},
 	})
@@ -155,7 +158,7 @@
 		Use:   "wait",
 		Short: "Waits for a few blocks to be produced and all nodes to catch up",
 		RunE: func(cmd *cobra.Command, args []string) error {
-			return Wait(cli.testnet, 5)
+			return Wait(cmd.Context(), cli.testnet, 5)
 		},
 	})
@@ -169,29 +172,28 @@
 	})
 	cli.root.AddCommand(&cobra.Command{
-		Use:   "resume",
-		Short: "Resumes the Docker testnet",
+		Use:   "pause",
+		Short: "Pauses the Docker testnet",
 		RunE: func(cmd *cobra.Command, args []string) error {
-			logger.Info("Resuming testnet")
-			return execCompose(cli.testnet.Dir, "up")
+			logger.Info("Pausing testnet")
+			return execCompose(cli.testnet.Dir, "pause")
 		},
 	})
 	cli.root.AddCommand(&cobra.Command{
-		Use:   "load [multiplier]",
-		Args:  cobra.MaximumNArgs(1),
+		Use:   "resume",
+		Short: "Resumes the Docker testnet",
+		RunE: func(cmd *cobra.Command, args []string) error {
+			logger.Info("Resuming testnet")
+			return execCompose(cli.testnet.Dir, "unpause")
+		},
+	})
+
+	cli.root.AddCommand(&cobra.Command{
+		Use:   "load",
 		Short: "Generates transaction load until the command is canceled",
 		RunE: func(cmd *cobra.Command, args []string) (err error) {
-			m := 1
-
-			if len(args) == 1 {
-				m, err = strconv.Atoi(args[0])
-				if err != nil {
-					return err
-				}
-			}
-
-			return Load(context.Background(), cli.testnet, m)
+			return Load(context.Background(), cli.testnet)
 		},
 	})
@@ -209,7 +211,7 @@
 				}
 			}
-			return InjectEvidence(cli.testnet, amount)
+			return InjectEvidence(cmd.Context(), cli.testnet, amount)
 		},
 	})
@@ -272,23 +274,26 @@ Does not run any perturbations.
 			}
 			chLoadResult := make(chan error)
-			ctx, loadCancel := context.WithCancel(context.Background())
+			ctx, cancel := context.WithCancel(cmd.Context())
+			defer cancel()
+
+			lctx, loadCancel := context.WithCancel(ctx)
 			defer loadCancel()
 			go func() {
-				err := Load(ctx, cli.testnet, 1)
+				err := Load(lctx, cli.testnet)
 				chLoadResult <- err
 			}()
-			if err := Start(cli.testnet); err != nil {
+			if err := Start(ctx, cli.testnet); err != nil {
 				return err
 			}
-			if err := Wait(cli.testnet, 5); err != nil { // allow some txs to go through
+			if err := Wait(ctx, cli.testnet, 5); err != nil { // allow some txs to go through
 				return err
 			}
 			// we benchmark performance over the next 100 blocks
-			if err := Benchmark(cli.testnet, 100); err != nil {
+			if err := Benchmark(ctx, cli.testnet, 100); err != nil {
 				return err
 			}
diff --git a/test/e2e/runner/perturb.go b/test/e2e/runner/perturb.go
index 8fb6ec726..900f75d73 100644
--- a/test/e2e/runner/perturb.go
+++ b/test/e2e/runner/perturb.go
@@ -1,6 +1,7 @@
 package main
 import (
+	"context"
 	"fmt"
 	"time"
@@ -9,14 +10,24 @@ import (
 )
 // Perturbs a running testnet.
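+//
+// Illustrative usage (hypothetical wiring; the runner passes its own
+// command context, and the timeout value here is arbitrary):
+//
+//	ctx, cancel := context.WithTimeout(cmd.Context(), 10*time.Minute)
+//	defer cancel()
+//	if err := Perturb(ctx, testnet); err != nil {
+//		return err
+//	}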
-func Perturb(testnet *e2e.Testnet) error {
+func Perturb(ctx context.Context, testnet *e2e.Testnet) error {
+	timer := time.NewTimer(0) // first tick fires immediately; reset below
+	defer timer.Stop()
+
 	for _, node := range testnet.Nodes {
 		for _, perturbation := range node.Perturbations {
-			_, err := PerturbNode(node, perturbation)
-			if err != nil {
-				return err
+			select {
+			case <-ctx.Done():
+				return ctx.Err()
+			case <-timer.C:
+				_, err := PerturbNode(ctx, node, perturbation)
+				if err != nil {
+					return err
+				}
+
+				// give the network some time to recover between each perturbation
+				timer.Reset(20 * time.Second)
 			}
-			time.Sleep(20 * time.Second) // give network some time to recover between each
 		}
 	}
 	return nil
@@ -24,7 +35,7 @@
 // PerturbNode perturbs a node with a given perturbation, returning its status
 // after recovering.
-func PerturbNode(node *e2e.Node, perturbation e2e.Perturbation) (*rpctypes.ResultStatus, error) {
+func PerturbNode(ctx context.Context, node *e2e.Node, perturbation e2e.Perturbation) (*rpctypes.ResultStatus, error) {
 	testnet := node.Testnet
 	switch perturbation {
 	case e2e.PerturbationDisconnect:
@@ -77,7 +88,9 @@
 		return nil, nil
 	}
-	status, err := waitForNode(node, 0, 3*time.Minute)
+	ctx, cancel := context.WithTimeout(ctx, 5*time.Minute)
+	defer cancel()
+	status, err := waitForNode(ctx, node, 0)
 	if err != nil {
 		return nil, err
 	}
diff --git a/test/e2e/runner/rpc.go b/test/e2e/runner/rpc.go
index 52c009caa..ca6b743eb 100644
--- a/test/e2e/runner/rpc.go
+++ b/test/e2e/runner/rpc.go
@@ -13,60 +13,123 @@
 )
 // waitForHeight waits for the network to reach a certain height (or above),
-// returning the highest height seen. Errors if the network is not making
+// returning the block at the target height. Errors if the network is not making
 // progress at all.
-func waitForHeight(testnet *e2e.Testnet, height int64) (*types.Block, *types.BlockID, error) {
+// If height == 0, the initial height of the test network is used as the target.
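+//
+// Example call (illustrative; the timeout value is arbitrary):
+//
+//	wctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
+//	defer cancel()
+//	block, blockID, err := waitForHeight(wctx, testnet, 0) // wait for the initial height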
+func waitForHeight(ctx context.Context, testnet *e2e.Testnet, height int64) (*types.Block, *types.BlockID, error) {
 	var (
-		err          error
-		maxResult    *rpctypes.ResultBlock
-		clients      = map[string]*rpchttp.HTTP{}
-		lastIncrease = time.Now()
+		err             error
+		clients         = map[string]*rpchttp.HTTP{}
+		lastHeight      int64
+		lastIncrease    = time.Now()
+		nodesAtHeight   = map[string]struct{}{}
+		numRunningNodes int
 	)
+	if height == 0 {
+		height = testnet.InitialHeight
+	}
+	for _, node := range testnet.Nodes {
+		if node.Stateless() {
+			continue
+		}
+
+		if node.HasStarted {
+			numRunningNodes++
+		}
+	}
+
+	timer := time.NewTimer(0)
+	defer timer.Stop()
 	for {
-		for _, node := range testnet.Nodes {
-			if node.Mode == e2e.ModeSeed {
-				continue
-			}
-			client, ok := clients[node.Name]
-			if !ok {
-				client, err = node.Client()
+		select {
+		case <-ctx.Done():
+			return nil, nil, ctx.Err()
+		case <-timer.C:
+			for _, node := range testnet.Nodes {
+				// skip nodes that have reached the target height
+				if _, ok := nodesAtHeight[node.Name]; ok {
+					continue
+				}
+
+				// skip nodes that don't have state or haven't started yet
+				if node.Stateless() {
+					continue
+				}
+				if !node.HasStarted {
+					continue
+				}
+
+				// cache the clients
+				client, ok := clients[node.Name]
+				if !ok {
+					client, err = node.Client()
 					if err != nil {
 						continue
 					}
-				clients[node.Name] = client
+					clients[node.Name] = client
+				}
+
+				wctx, cancel := context.WithTimeout(ctx, 10*time.Second)
+				defer cancel()
+				result, err := client.Status(wctx)
 				if err != nil {
 					continue
 				}
+				if result.SyncInfo.LatestBlockHeight > lastHeight {
+					lastHeight = result.SyncInfo.LatestBlockHeight
+					lastIncrease = time.Now()
+				}
+
+				if result.SyncInfo.LatestBlockHeight >= height {
+					// the node has achieved the target height!
+
+					// add this node to the set of target
+					// height nodes
+					nodesAtHeight[node.Name] = struct{}{}
+
+					// if not all of the nodes that we
+					// have clients for have reached the
+					// target height, keep trying.
+					if numRunningNodes > len(nodesAtHeight) {
+						continue
+					}
+
+					// All nodes are at or above the target height. Now fetch the block for that target height
+					// and return it. We loop again through all clients because some may have pruning enabled,
+					// but at least two of them should be archive nodes.
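+					// A client whose node has pruned this height will simply
+					// return an error here, so we try each client in turn
+					// until one can still serve the block.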
+					for _, c := range clients {
+						result, err := c.Block(ctx, &height)
+						if err != nil || result == nil || result.Block == nil {
+							continue
+						}
+						return result.Block, &result.BlockID, nil
+					}
+				}
 			}
-			ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
-			defer cancel()
-			result, err := client.Block(ctx, nil)
-			if err != nil {
-				continue
+			if len(clients) == 0 {
+				return nil, nil, errors.New("unable to connect to any network nodes")
 			}
-			if result.Block != nil && (maxResult == nil || result.Block.Height > maxResult.Block.Height) {
-				maxResult = result
-				lastIncrease = time.Now()
-			}
-			if maxResult != nil && maxResult.Block.Height >= height {
-				return maxResult.Block, &maxResult.BlockID, nil
-			}
-		}
+			if time.Since(lastIncrease) >= time.Minute {
+				if lastHeight == 0 {
+					return nil, nil, errors.New("chain stalled at unknown height (most likely upon starting)")
+				}
+
+				return nil, nil, fmt.Errorf("chain stalled at height %v [%d of %d nodes %+v]",
+					lastHeight,
+					len(nodesAtHeight),
+					numRunningNodes,
+					nodesAtHeight)
-		if len(clients) == 0 {
-			return nil, nil, errors.New("unable to connect to any network nodes")
-		}
-		if time.Since(lastIncrease) >= time.Minute {
-			if maxResult == nil {
-				return nil, nil, errors.New("chain stalled at unknown height")
 			}
-			return nil, nil, fmt.Errorf("chain stalled at height %v", maxResult.Block.Height)
+			timer.Reset(1 * time.Second)
 		}
-		time.Sleep(1 * time.Second)
 	}
 }
 // waitForNode waits for a node to become available and catch up to the given block height.
-func waitForNode(node *e2e.Node, height int64, timeout time.Duration) (*rpctypes.ResultStatus, error) {
+func waitForNode(ctx context.Context, node *e2e.Node, height int64) (*rpctypes.ResultStatus, error) {
 	if node.Mode == e2e.ModeSeed {
 		return nil, nil
 	}
@@ -75,42 +138,91 @@ func waitForNode(node *e2e.Node, height int64, timeout time.Duration) (*rpctypes
 		return nil, err
 	}
-	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	defer cancel()
+	timer := time.NewTimer(0)
+	defer timer.Stop()
+	var (
+		lastFailed bool
+		counter    int
+	)
 	for {
-		status, err := client.Status(ctx)
-		switch {
-		case errors.Is(err, context.DeadlineExceeded):
-			return nil, fmt.Errorf("timed out waiting for %v to reach height %v", node.Name, height)
-		case errors.Is(err, context.Canceled):
-			return nil, err
-		case err == nil && status.SyncInfo.LatestBlockHeight >= height:
-			return status, nil
+		counter++
+		if lastFailed {
+			lastFailed = false
+
+			// if there was a problem with the request in
+			// the previous iteration, recreate the client
+			// to ensure reconnection
+			client, err = node.Client()
+			if err != nil {
+				return nil, err
+			}
 		}
-		time.Sleep(300 * time.Millisecond)
+		select {
+		case <-ctx.Done():
+			return nil, ctx.Err()
+		case <-timer.C:
+			status, err := client.Status(ctx)
+			switch {
+			case errors.Is(err, context.DeadlineExceeded):
+				return nil, fmt.Errorf("timed out waiting for %v to reach height %v", node.Name, height)
+			case errors.Is(err, context.Canceled):
+				return nil, err
+			case err == nil && status.SyncInfo.LatestBlockHeight >= height:
+				return status, nil
+			case counter%100 == 0:
+				switch {
+				case err != nil:
+					lastFailed = true
+					logger.Error("node not yet ready",
+						"iter", counter,
+						"node", node.Name,
+						"err", err,
+						"target", height,
+					)
+				case status != nil:
+					logger.Error("node not yet ready",
+						"iter", counter,
+						"node", node.Name,
+						"height", status.SyncInfo.LatestBlockHeight,
+						"target", height,
+					)
+				}
+			}
+			timer.Reset(250 * time.Millisecond)
+		}
 	}
 }
-// waitForAllNodes waits for all nodes to become available and catch up to the given block height.
-func waitForAllNodes(testnet *e2e.Testnet, height int64, timeout time.Duration) (int64, error) {
-	var lastHeight int64
-
+// getLatestBlock returns the latest block that all active nodes in the network have
+// agreed upon, i.e. the earliest of each node's latest blocks.
+func getLatestBlock(ctx context.Context, testnet *e2e.Testnet) (*types.Block, error) {
+	var earliestBlock *types.Block
 	for _, node := range testnet.Nodes {
-		if node.Mode == e2e.ModeSeed {
+		// skip nodes that don't have state or haven't started yet
+		if node.Stateless() {
+			continue
+		}
+		if !node.HasStarted {
 			continue
 		}
-		status, err := waitForNode(node, height, timeout)
+		client, err := node.Client()
 		if err != nil {
-			return 0, err
+			return nil, err
 		}
-		if status.SyncInfo.LatestBlockHeight > lastHeight {
-			lastHeight = status.SyncInfo.LatestBlockHeight
+		wctx, cancel := context.WithTimeout(ctx, 10*time.Second)
+		defer cancel()
+		result, err := client.Block(wctx, nil)
+		if err != nil {
+			return nil, err
+		}
+
+		if result.Block != nil && (earliestBlock == nil || earliestBlock.Height > result.Block.Height) {
+			earliestBlock = result.Block
 		}
 	}
-
-	return lastHeight, nil
+	return earliestBlock, nil
 }
diff --git a/test/e2e/runner/setup.go b/test/e2e/runner/setup.go
index a0bd4996a..3af7a9944 100644
--- a/test/e2e/runner/setup.go
+++ b/test/e2e/runner/setup.go
@@ -297,15 +297,18 @@ func MakeConfig(node *e2e.Node) (*config.Config, error) {
 	}
 	if node.BlockSync == "" {
-		cfg.FastSyncMode = false
+		cfg.BlockSync.Enable = false
 	} else {
 		cfg.BlockSync.Version = node.BlockSync
 	}
-	if node.StateSync {
+	switch node.StateSync {
+	case e2e.StateSyncP2P:
+		cfg.StateSync.Enable = true
+		cfg.StateSync.UseP2P = true
+	case e2e.StateSyncRPC:
+		cfg.StateSync.Enable = true
 		cfg.StateSync.RPCServers = []string{}
-
 		for _, peer := range node.Testnet.ArchiveNodes() {
 			if peer.Name == node.Name {
 				continue
diff --git a/test/e2e/runner/start.go b/test/e2e/runner/start.go
index c8d6163ed..967d2519c 100644
--- a/test/e2e/runner/start.go
+++ b/test/e2e/runner/start.go
@@ -1,6 +1,7 @@
 package main
 import (
+	"context"
 	"fmt"
 	"sort"
 	"time"
@@ -8,7 +9,7 @@
 	e2e "github.com/tendermint/tendermint/test/e2e/pkg"
 )
-func Start(testnet *e2e.Testnet) error {
+func Start(ctx context.Context, testnet *e2e.Testnet) error {
 	if len(testnet.Nodes) == 0 {
 		return fmt.Errorf("no nodes in testnet")
 	}
@@ -45,9 +46,17 @@
 		if err := execCompose(testnet.Dir, "up", "-d", node.Name); err != nil {
 			return err
 		}
-		if _, err := waitForNode(node, 0, time.Minute); err != nil {
+
+		if err := func() error {
+			ctx, cancel := context.WithTimeout(ctx, time.Minute)
+			defer cancel()
+
+			_, err := waitForNode(ctx, node, 0)
+			return err
+		}(); err != nil {
 			return err
 		}
+		node.HasStarted = true
 		logger.Info(fmt.Sprintf("Node %v up on http://127.0.0.1:%v", node.Name, node.ProxyPort))
 	}
@@ -59,21 +68,11 @@
 		"nodes", len(testnet.Nodes)-len(nodeQueue),
 		"pending", len(nodeQueue))
-	block, blockID, err := waitForHeight(testnet, networkHeight)
+	block, blockID, err := waitForHeight(ctx, testnet, networkHeight)
 	if err != nil {
 		return err
 	}
-	// Update any state sync nodes with a trusted height and hash
-	for _, node := range nodeQueue {
-		if node.StateSync || node.Mode == e2e.ModeLight {
-			err = UpdateConfigStateSync(node, block.Height, blockID.Hash.Bytes())
-			if err != nil {
-				return err
-			}
-		}
-	}
-
 	for _, node := range nodeQueue {
 		if node.StartAt > networkHeight {
 			// if we're starting a node that's ahead of
@@ -83,26 +82,42 @@ func Start(testnet *e2e.Testnet) error {
 			// that this node will start at before we
 			// start the node.
+			logger.Info("Waiting for network to advance to height",
+				"node", node.Name,
+				"last_height", networkHeight,
+				"waiting_for", node.StartAt,
+				"size", len(testnet.Nodes)-len(nodeQueue),
+				"pending", len(nodeQueue))
+
 			networkHeight = node.StartAt
-			logger.Info("Waiting for network to advance before starting catch up node",
-				"node", node.Name,
-				"height", networkHeight)
-
-			if _, _, err := waitForHeight(testnet, networkHeight); err != nil {
+			block, blockID, err = waitForHeight(ctx, testnet, networkHeight)
+			if err != nil {
 				return err
 			}
 		}
-		logger.Info("Starting catch up node", "node", node.Name, "height", node.StartAt)
+		// Update any state sync nodes with a trusted height and hash
+		if node.StateSync != e2e.StateSyncDisabled || node.Mode == e2e.ModeLight {
+			err = UpdateConfigStateSync(node, block.Height, blockID.Hash.Bytes())
+			if err != nil {
+				return err
+			}
+		}
 		if err := execCompose(testnet.Dir, "up", "-d", node.Name); err != nil {
 			return err
 		}
-		status, err := waitForNode(node, node.StartAt, 8*time.Minute)
+
+		wctx, wcancel := context.WithTimeout(ctx, 8*time.Minute)
+		status, err := waitForNode(wctx, node, node.StartAt)
 		if err != nil {
+			wcancel()
 			return err
 		}
+		wcancel()
+
+		node.HasStarted = true
 		logger.Info(fmt.Sprintf("Node %v up on http://127.0.0.1:%v at height %v",
 			node.Name, node.ProxyPort, status.SyncInfo.LatestBlockHeight))
 	}
diff --git a/test/e2e/runner/test.go b/test/e2e/runner/test.go
index 834ce6f2d..ac24b0cd2 100644
--- a/test/e2e/runner/test.go
+++ b/test/e2e/runner/test.go
@@ -15,5 +15,5 @@ func Test(testnet *e2e.Testnet) error {
 		return err
 	}
-	return execVerbose("go", "test", "-count", "1", "./tests/...")
+	return execVerbose("./build/tests", "-test.count", "1")
 }
diff --git a/test/e2e/runner/wait.go b/test/e2e/runner/wait.go
index 9f3a4c438..e3f955071 100644
--- a/test/e2e/runner/wait.go
+++ b/test/e2e/runner/wait.go
@@ -1,31 +1,27 @@
 package main
 import (
+	"context"
 	"fmt"
-	"time"
 	e2e "github.com/tendermint/tendermint/test/e2e/pkg"
 )
 // Wait waits for a number of blocks to be produced, and for all nodes to catch
 // up with it.
-func Wait(testnet *e2e.Testnet, blocks int64) error {
-	block, _, err := waitForHeight(testnet, 0)
+func Wait(ctx context.Context, testnet *e2e.Testnet, blocks int64) error {
+	block, err := getLatestBlock(ctx, testnet)
 	if err != nil {
 		return err
 	}
-	return WaitUntil(testnet, block.Height+blocks)
+	return WaitUntil(ctx, testnet, block.Height+blocks)
 }
 // WaitUntil waits until a given height has been reached.
-func WaitUntil(testnet *e2e.Testnet, height int64) error {
+func WaitUntil(ctx context.Context, testnet *e2e.Testnet, height int64) error {
 	logger.Info(fmt.Sprintf("Waiting for all nodes to reach height %v...", height))
-	_, err := waitForAllNodes(testnet, height, waitingTime(len(testnet.Nodes)))
+
+	_, _, err := waitForHeight(ctx, testnet, height)
+
 	return err
 }
-
-// waitingTime estimates how long it should take for a node to reach the height.
-// More nodes in a network implies we may expect a slower network and may have to wait longer.
-func waitingTime(nodes int) time.Duration {
-	return time.Minute + (time.Duration(nodes) * (30 * time.Second))
-}
diff --git a/test/e2e/tests/app_test.go b/test/e2e/tests/app_test.go
index 08710f168..e89a9bae9 100644
--- a/test/e2e/tests/app_test.go
+++ b/test/e2e/tests/app_test.go
@@ -2,6 +2,7 @@ package e2e_test
 import (
 	"bytes"
+	"context"
 	"fmt"
 	"math/rand"
 	"testing"
@@ -10,6 +11,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+	"github.com/tendermint/tendermint/rpc/client/http"
 	e2e "github.com/tendermint/tendermint/test/e2e/pkg"
 	"github.com/tendermint/tendermint/types"
 )
@@ -17,9 +19,6 @@
 // Tests that any initial state given in genesis has made it into the app.
 func TestApp_InitialState(t *testing.T) {
 	testNode(t, func(t *testing.T, node e2e.Node) {
-		if node.Stateless() {
-			return
-		}
 		if len(node.Testnet.InitialState) == 0 {
 			return
 		}
@@ -39,10 +38,6 @@
 // block and the node sync status.
 func TestApp_Hash(t *testing.T) {
 	testNode(t, func(t *testing.T, node e2e.Node) {
-		if node.Mode == e2e.ModeSeed {
-			return
-		}
-
 		client, err := node.Client()
 		require.NoError(t, err)
 		info, err := client.ABCIInfo(ctx)
@@ -51,7 +46,7 @@
 		block, err := client.Block(ctx, nil)
 		require.NoError(t, err)
-		require.EqualValues(t, info.Response.LastBlockAppHash, block.Block.AppHash,
+		require.EqualValues(t, info.Response.LastBlockAppHash, block.Block.AppHash.Bytes(),
 			"app hash does not match last block's app hash")
 		status, err := client.Status(ctx)
@@ -63,47 +58,91 @@
 // Tests that we can set a value and retrieve it.
 func TestApp_Tx(t *testing.T) {
-	testNode(t, func(t *testing.T, node e2e.Node) {
-		if node.Mode == e2e.ModeSeed {
-			return
+	type broadcastFunc func(context.Context, types.Tx) error
+
+	testCases := []struct {
+		Name        string
+		WaitTime    time.Duration
+		BroadcastTx func(client *http.HTTP) broadcastFunc
+		ShouldSkip  bool
+	}{
+		{
+			Name:     "Sync",
+			WaitTime: 30 * time.Second,
+			BroadcastTx: func(client *http.HTTP) broadcastFunc {
+				return func(ctx context.Context, tx types.Tx) error {
+					_, err := client.BroadcastTxSync(ctx, tx)
+					return err
+				}
+			},
+		},
+		{
+			Name:     "Commit",
+			WaitTime: time.Minute,
+			BroadcastTx: func(client *http.HTTP) broadcastFunc {
+				return func(ctx context.Context, tx types.Tx) error {
+					_, err := client.BroadcastTxCommit(ctx, tx)
+					return err
+				}
+			},
+		},
+		{
+			Name:       "Async",
+			WaitTime:   time.Minute,
+			ShouldSkip: true,
+			BroadcastTx: func(client *http.HTTP) broadcastFunc {
+				return func(ctx context.Context, tx types.Tx) error {
+					_, err := client.BroadcastTxAsync(ctx, tx)
+					return err
+				}
+			},
+		},
+	}
+
+	for idx, test := range testCases {
+		if test.ShouldSkip {
+			continue
 		}
+		t.Run(test.Name, func(t *testing.T) {
+			// testNode calls t.Parallel as well, so we should
+			// have a copy of the test case local to this closure
+			test := testCases[idx]
+			testNode(t, func(t *testing.T, node e2e.Node) {
+				client, err := node.Client()
+				require.NoError(t, err)
-		client, err := node.Client()
-		require.NoError(t, err)
+				// Generate a random value, to prevent duplicate tx errors when
+				// manually running the test multiple times for a testnet.
+				bz := make([]byte, 32)
+				_, err = rand.Read(bz)
+				require.NoError(t, err)
-		// Generate a random value, to prevent duplicate tx errors when
-		// manually running the test multiple times for a testnet.
-		r := rand.New(rand.NewSource(time.Now().UnixNano()))
-		bz := make([]byte, 32)
-		_, err = r.Read(bz)
-		require.NoError(t, err)
+				key := fmt.Sprintf("testapp-tx-%v", node.Name)
+				value := fmt.Sprintf("%x", bz)
+				tx := types.Tx(fmt.Sprintf("%v=%v", key, value))
-		key := fmt.Sprintf("testapp-tx-%v", node.Name)
-		value := fmt.Sprintf("%x", bz)
-		tx := types.Tx(fmt.Sprintf("%v=%v", key, value))
+				require.NoError(t, test.BroadcastTx(client)(ctx, tx))
-		_, err = client.BroadcastTxSync(ctx, tx)
-		require.NoError(t, err)
+				hash := tx.Hash()
-		hash := tx.Hash()
-		waitTime := 20 * time.Second
+				require.Eventuallyf(t, func() bool {
+					txResp, err := client.Tx(ctx, hash, false)
+					return err == nil && bytes.Equal(txResp.Tx, tx)
+				},
+					test.WaitTime, // timeout
+					time.Second,   // interval
+					"submitted tx %X wasn't committed after %v",
+					hash, test.WaitTime,
+				)
-		require.Eventuallyf(t, func() bool {
-			txResp, err := client.Tx(ctx, hash, false)
-			return err == nil && bytes.Equal(txResp.Tx, tx)
-		}, waitTime, time.Second,
-			"submitted tx %X wasn't committed after %v", hash, waitTime,
-		)
+				abciResp, err := client.ABCIQuery(ctx, "", []byte(key))
+				require.NoError(t, err)
+				assert.Equal(t, key, string(abciResp.Response.Key))
+				assert.Equal(t, value, string(abciResp.Response.Value))
+			})
-		// NOTE: we don't test abci query of the light client
-		if node.Mode == e2e.ModeLight {
-			return
-		}
+		})
-		abciResp, err := client.ABCIQuery(ctx, "", []byte(key))
-		require.NoError(t, err)
-		assert.Equal(t, key, string(abciResp.Response.Key))
-		assert.Equal(t, value, string(abciResp.Response.Value))
+	}
-	})
 }
diff --git a/test/e2e/tests/block_test.go b/test/e2e/tests/block_test.go
index 21aeeda99..f83cf3757 100644
--- a/test/e2e/tests/block_test.go
+++ b/test/e2e/tests/block_test.go
@@ -13,10 +13,6 @@ import (
 func TestBlock_Header(t *testing.T) {
 	blocks := fetchBlockChain(t)
 	testNode(t, func(t *testing.T, node e2e.Node) {
-		if node.Mode == e2e.ModeSeed {
-			return
-		}
-
 		client, err := node.Client()
 		require.NoError(t, err)
 		status, err := client.Status(ctx)
@@ -34,7 +30,7 @@ func TestBlock_Header(t *testing.T) {
 		}
 		// the first blocks after state sync come from the backfill process
 		// and are therefore not complete
-		if node.StateSync && block.Header.Height <= first+e2e.EvidenceAgeHeight+1 {
+		if node.StateSync != e2e.StateSyncDisabled && block.Header.Height <= first+e2e.EvidenceAgeHeight+1 {
 			continue
 		}
 		if block.Header.Height > last {
@@ -55,10 +51,6 @@ func TestBlock_Header(t *testing.T) {
 // Tests that the node contains the expected block range.
 func TestBlock_Range(t *testing.T) {
 	testNode(t, func(t *testing.T, node e2e.Node) {
-		if node.Mode == e2e.ModeSeed {
-			return
-		}
-
 		client, err := node.Client()
 		require.NoError(t, err)
 		status, err := client.Status(ctx)
@@ -70,7 +62,7 @@ func TestBlock_Range(t *testing.T) {
 		switch {
 		// if the node state synced we ignore any assertions because it's hard to know how far back
 		// the node ran reverse sync for
-		case node.StateSync:
+		case node.StateSync != e2e.StateSyncDisabled:
 			break
 		case node.RetainBlocks > 0 && int64(node.RetainBlocks) < (last-node.Testnet.InitialHeight+1):
 			// Delta handles race conditions in reading first/last heights.
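+			// Illustrative arithmetic (hypothetical numbers): with
+			// RetainBlocks=10 and last=100, the first retained height
+			// should be roughly 91; the delta absorbs heights read at
+			// slightly different times.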
@@ -83,7 +75,7 @@ func TestBlock_Range(t *testing.T) {
 		}
 		for h := first; h <= last; h++ {
-			if node.StateSync && h <= first+e2e.EvidenceAgeHeight+1 {
+			if node.StateSync != e2e.StateSyncDisabled && h <= first+e2e.EvidenceAgeHeight+1 {
 				continue
 			}
 			resp, err := client.Block(ctx, &(h))
diff --git a/test/e2e/tests/e2e_test.go b/test/e2e/tests/e2e_test.go
index 15c747b5b..acc3ac78a 100644
--- a/test/e2e/tests/e2e_test.go
+++ b/test/e2e/tests/e2e_test.go
@@ -3,7 +3,6 @@ package e2e_test
 import (
 	"context"
 	"os"
-	"path/filepath"
 	"sync"
 	"testing"
@@ -30,8 +29,9 @@
 	blocksCacheMtx = sync.Mutex{}
 )
-// testNode runs tests for testnet nodes. The callback function is given a
-// single node to test, running as a subtest in parallel with other subtests.
+// testNode runs tests for testnet nodes. The callback function is
+// given a single stateful node to test, running as a subtest in
+// parallel with other subtests.
 //
 // The testnet manifest must be given as the envvar E2E_MANIFEST. If not set,
 // these tests are skipped so that they're not picked up during normal unit
@@ -51,6 +51,11 @@ func testNode(t *testing.T, testFunc func(*testing.T, e2e.Node)) {
 	for _, node := range nodes {
 		node := *node
+
+		if node.Stateless() {
+			continue
+		}
+
 		t.Run(node.Name, func(t *testing.T) {
 			t.Parallel()
 			testFunc(t, node)
@@ -66,9 +71,6 @@ func loadTestnet(t *testing.T) e2e.Testnet {
 	if manifest == "" {
 		t.Skip("E2E_MANIFEST not set, not an end-to-end test run")
 	}
-	if !filepath.IsAbs(manifest) {
-		manifest = filepath.Join("..", manifest)
-	}
 	testnetCacheMtx.Lock()
 	defer testnetCacheMtx.Unlock()
diff --git a/test/e2e/tests/net_test.go b/test/e2e/tests/net_test.go
index 8d331aff9..e6ff27a0e 100644
--- a/test/e2e/tests/net_test.go
+++ b/test/e2e/tests/net_test.go
@@ -14,11 +14,6 @@ func TestNet_Peers(t *testing.T) {
 	t.SkipNow()
 	testNode(t, func(t *testing.T, node e2e.Node) {
-		// Seed nodes shouldn't necessarily mesh with the entire network.
-		if node.Mode == e2e.ModeSeed {
-			return
-		}
-
 		client, err := node.Client()
 		require.NoError(t, err)
 		netInfo, err := client.NetInfo(ctx)
diff --git a/test/e2e/tests/validator_test.go b/test/e2e/tests/validator_test.go
index 847a8d388..6e836ff78 100644
--- a/test/e2e/tests/validator_test.go
+++ b/test/e2e/tests/validator_test.go
@@ -14,10 +14,6 @@ import (
 // scheduled validator updates.
 func TestValidator_Sets(t *testing.T) {
 	testNode(t, func(t *testing.T, node e2e.Node) {
-		if node.Mode == e2e.ModeSeed {
-			return
-		}
-
 		client, err := node.Client()
 		require.NoError(t, err)
 		status, err := client.Status(ctx)
diff --git a/types/evidence.go b/types/evidence.go
index 40ff85e5e..330850ea3 100644
--- a/types/evidence.go
+++ b/types/evidence.go
@@ -46,15 +46,20 @@ type DuplicateVoteEvidence struct {
 var _ Evidence = &DuplicateVoteEvidence{}
 // NewDuplicateVoteEvidence creates DuplicateVoteEvidence with right ordering given
-// two conflicting votes. If one of the votes is nil, evidence returned is nil as well
-func NewDuplicateVoteEvidence(vote1, vote2 *Vote, blockTime time.Time, valSet *ValidatorSet) *DuplicateVoteEvidence {
+// two conflicting votes. If either of the votes is nil, the validator set is nil,
+// or the voter is not in the validator set, an error is returned.
+func NewDuplicateVoteEvidence(vote1, vote2 *Vote, blockTime time.Time, valSet *ValidatorSet,
+) (*DuplicateVoteEvidence, error) {
 	var voteA, voteB *Vote
-	if vote1 == nil || vote2 == nil || valSet == nil {
-		return nil
+	if vote1 == nil || vote2 == nil {
+		return nil, errors.New("missing vote")
+	}
+	if valSet == nil {
+		return nil, errors.New("missing validator set")
 	}
 	idx, val := valSet.GetByAddress(vote1.ValidatorAddress)
 	if idx == -1 {
-		return nil
+		return nil, errors.New("validator not in validator set")
 	}
 	if strings.Compare(vote1.BlockID.Key(), vote2.BlockID.Key()) == -1 {
@@ -70,7 +75,7 @@
 		TotalVotingPower: valSet.TotalVotingPower(),
 		ValidatorPower:   val.VotingPower,
 		Timestamp:        blockTime,
-	}
+	}, nil
 }
 // ABCI returns the application relevant representation of the evidence
@@ -92,7 +97,7 @@
 	pbe := dve.ToProto()
 	bz, err := pbe.Marshal()
 	if err != nil {
-		panic(err)
+		panic("marshaling duplicate vote evidence to bytes: " + err.Error())
 	}
 	return bz
@@ -260,11 +265,11 @@ func (l *LightClientAttackEvidence) ABCI() []abci.Evidence {
 func (l *LightClientAttackEvidence) Bytes() []byte {
 	pbe, err := l.ToProto()
 	if err != nil {
-		panic(err)
+		panic("converting light client attack evidence to proto: " + err.Error())
 	}
 	bz, err := pbe.Marshal()
 	if err != nil {
-		panic(err)
+		panic("marshaling light client attack evidence to bytes: " + err.Error())
 	}
 	return bz
 }
@@ -684,7 +689,11 @@ func NewMockDuplicateVoteEvidenceWithValidator(height int64, time time.Time,
 	vB := voteB.ToProto()
 	_ = pv.SignVote(context.Background(), chainID, vB)
 	voteB.Signature = vB.Signature
-	return NewDuplicateVoteEvidence(voteA, voteB, time, NewValidatorSet([]*Validator{val}))
+	ev, err := NewDuplicateVoteEvidence(voteA, voteB, time, NewValidatorSet([]*Validator{val}))
+	if err != nil {
+		panic("constructing mock duplicate vote evidence: " + err.Error())
+	}
+	return ev
 }
 func makeMockVote(height int64, round, index int32, addr Address,
diff --git a/types/evidence_test.go b/types/evidence_test.go
index 9d54797e4..5110bcb1d 100644
--- a/types/evidence_test.go
+++ b/types/evidence_test.go
@@ -85,7 +85,8 @@ func TestDuplicateVoteEvidenceValidation(t *testing.T) {
 			vote1 := makeVote(t, val, chainID, math.MaxInt32, math.MaxInt64, math.MaxInt32, 0x02, blockID, defaultVoteTime)
 			vote2 := makeVote(t, val, chainID, math.MaxInt32, math.MaxInt64, math.MaxInt32, 0x02, blockID2, defaultVoteTime)
 			valSet := NewValidatorSet([]*Validator{val.ExtractIntoValidator(10)})
-			ev := NewDuplicateVoteEvidence(vote1, vote2, defaultVoteTime, valSet)
+			ev, err := NewDuplicateVoteEvidence(vote1, vote2, defaultVoteTime, valSet)
+			require.NoError(t, err)
 			tc.malleateEvidence(ev)
 			assert.Equal(t, tc.expectErr, ev.ValidateBasic() != nil, "Validate Basic had an unexpected result")
 		})