diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 524df1ef8..0a006f9b9 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -39,17 +39,17 @@ jobs: platforms: all - name: Set up Docker Build - uses: docker/setup-buildx-action@v1.7.0 + uses: docker/setup-buildx-action@v2.0.0 - name: Login to DockerHub if: ${{ github.event_name != 'pull_request' }} - uses: docker/login-action@v1.14.1 + uses: docker/login-action@v2.0.0 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Publish to Docker Hub - uses: docker/build-push-action@v2.10.0 + uses: docker/build-push-action@v3.0.0 with: context: . file: ./DOCKER/Dockerfile diff --git a/Makefile b/Makefile index a0b970f76..cd9380768 100644 --- a/Makefile +++ b/Makefile @@ -251,6 +251,21 @@ mockery: go generate -run="./scripts/mockery_generate.sh" ./... .PHONY: mockery +############################################################################### +### Metrics ### +############################################################################### + +metrics: testdata-metrics + go generate -run="scripts/metricsgen" ./... +.PHONY: metrics + + # By convention, the go tool ignores subdirectories of directories named + # 'testdata'. This command invokes the generate command on the folder directly + # to avoid this. +testdata-metrics: + ls ./scripts/metricsgen/testdata | xargs -I{} go generate -run="scripts/metricsgen" ./scripts/metricsgen/testdata/{} +.PHONY: testdata-metrics + ############################################################################### ### Local testnet using docker ### ############################################################################### diff --git a/docs/rfc/README.md b/docs/rfc/README.md index 6b03cf21a..a83d802b8 100644 --- a/docs/rfc/README.md +++ b/docs/rfc/README.md @@ -56,4 +56,7 @@ sections. - [RFC-018: BLS Signature Aggregation Exploration](./rfc-018-bls-agg-exploration.md) - [RFC-019: Configuration File Versioning](./rfc-019-config-version.md) + +- [RFC-017: ABCI++ Vote Extension Propagation](./rfc-017-abci++-vote-extension-propag.md) + diff --git a/docs/rfc/rfc-017-abci++-vote-extension-propag.md b/docs/rfc/rfc-017-abci++-vote-extension-propag.md new file mode 100644 index 000000000..15d08f7ba --- /dev/null +++ b/docs/rfc/rfc-017-abci++-vote-extension-propag.md @@ -0,0 +1,571 @@ +# RFC 017: ABCI++ Vote Extension Propagation + +## Changelog + +- 11-Apr-2022: Initial draft (@sergio-mena). +- 15-Apr-2022: Addressed initial comments. First complete version (@sergio-mena). +- 09-May-2022: Addressed all outstanding comments. + +## Abstract + +According to the +[ABCI++ specification](https://github.com/tendermint/tendermint/blob/4743a7ad0/spec/abci%2B%2B/README.md) +(as of 11-Apr-2022), a validator MUST provide a signed vote extension for each non-`nil` precommit vote +of height *h* that it uses to propose a block in height *h+1*. When a validator is up to +date, this is easy to do, but when a validator needs to catch up this is far from trivial as this data +cannot be retrieved from the blockchain. + +This RFC presents and compares the different options to address this problem, which have been proposed +in several discussions by the Tendermint Core team. + +## Document Structure + +The RFC is structured as follows. In the [Background](#background) section, +subsections [Problem Description](#problem-description) and [Cases to Address](#cases-to-address) +explain the problem at hand from a high level perspective, i.e., abstracting away from the current +Tendermint implementation. In contrast, subsection +[Current Catch-up Mechanisms](#current-catch-up-mechanisms) delves into the details of the current +Tendermint code. + +In the [Discussion](#discussion) section, subsection [Solutions Proposed](#solutions-proposed) is also +worded abstracting away from implementation details, whilst subsections +[Feasibility of the Proposed Solutions](#feasibility-of-the-proposed-solutions) and +[Current Limitations and Possible Implementations](#current-limitations-and-possible-implementations) +analize the viability of one of the proposed solutions in the context of Tendermint's architecture +based on reactors. Finally, [Formalization Work](#formalization-work) briefly discusses the work +still needed demonstrate the correctness of the chosen solution. + +The high level subsections are aimed at readers who are familiar with consensus algorithms, in +particular with the one described in the Tendermint (white paper), but who are not necessarily +acquainted with the details of the Tendermint codebase. The other subsections, which go into +implementation details, are best understood by engineers with deep knowledge of the implementation of +Tendermint's blocksync and consensus reactors. + +## Background + +### Basic Definitions + +This document assumes that all validators have equal voting power for the sake of simplicity. This is done +without loss of generality. + +There are two types of votes in Tendermint: *prevotes* and *precommits*. Votes can be `nil` or refer to +a proposed block. This RFC focuses on precommits, +also known as *precommit votes*. In this document we sometimes call them simply *votes*. + +Validators send precommit votes to their peer nodes in *precommit messages*. According to the +[ABCI++ specification](https://github.com/tendermint/tendermint/blob/4743a7ad0/spec/abci%2B%2B/README.md), +a precommit message MUST also contain a *vote extension*. +This mandatory vote extension can be empty, but MUST be signed with the same key as the precommit +vote (i.e., the sending validator's). +Nevertheless, the vote extension is signed independently from the vote, so a vote can be separated from +its extension. +The reason for vote extensions to be mandatory in precommit messages is that, otherwise, a (malicious) +node can omit a vote extension while still providing/forwarding/sending the corresponding precommit vote. + +The validator set at height *h* is denoted *valseth*. A *commit* for height *h* consists of more +than *2nh/3* precommit votes voting for a block *b*, where *nh* denotes the size of +*valseth*. A commit does not contain `nil` precommit votes, and all votes in it refer to the +same block. An *extended commit* is a *commit* where every precommit vote has its respective vote extension +attached. + +### Problem Description + +In the version of [ABCI](https://github.com/tendermint/spec/blob/4fb99af/spec/abci/README.md) present up to +Tendermint v0.35, for any height *h*, a validator *v* MUST have the decided block *b* and a commit for +height *h* in order to decide at height *h*. Then, *v* just needs a commit for height *h* to propose at +height *h+1*, in the rounds of *h+1* where *v* is a proposer. + +In [ABCI++](https://github.com/tendermint/tendermint/blob/4743a7ad0/spec/abci%2B%2B/README.md), +the information that a validator *v* MUST have to be able to decide in *h* does not change with +respect to pre-existing ABCI: the decided block *b* and a commit for *h*. +In contrast, for proposing in *h+1*, a commit for *h* is not enough: *v* MUST now have an extended +commit. + +When a validator takes an active part in consensus at height *h*, it has all the data it needs in memory, +in its consensus state, to decide on *h* and propose in *h+1*. Things are not so easy in the cases when +*v* cannot take part in consensus because it is late (e.g., it falls behind, it crashes +and recovers, or it just starts after the others). If *v* does not take part, it cannot actively +gather precommit messages (which include vote extensions) in order to decide. +Before ABCI++, this was not a problem: full nodes are supposed to persist past blocks in the block store, +so other nodes would realise that *v* is late and send it the missing decided block at height *h* and +the corresponding commit (kept in block *h+1*) so that *v* can catch up. +However, we cannot apply this catch-up technique for ABCI++, as the vote extensions, which are part +of the needed *extended commit* are not part of the blockchain. + +### Cases to Address + +Before we tackle the description of the possible cases we need to address, let us describe the following +incremental improvement to the ABCI++ logic. Upon decision, a full node persists (e.g., in the block +store) the extended commit that allowed the node to decide. For the moment, let us assume the node only +needs to keep its *most recent* extended commit, and MAY remove any older extended commits from persistent +storage. +This improvement is so obvious that all solutions described in the [Discussion](#discussion) section use +it as a building block. Moreover, it completely addresses by itself some of the cases described in this +subsection. + +We now describe the cases (i.e. possible *runs* of the system) that have been raised in different +discussions and need to be addressed. They are (roughly) ordered from easiest to hardest to deal with. + +- **(a)** *Happy path: all validators advance together, no crash*. + + This case is included for completeness. All validators have taken part in height *h*. + Even if some of them did not manage to send a precommit message for the decided block, they all + receive enough precommit messages to be able to decide. As vote extensions are mandatory in + precommit messages, every validator *v* trivially has all the information, namely the decided block + and the extended commit, needed to propose in height *h+1* for the rounds in which *v* is the + proposer. + + No problem to solve here. + +- **(b)** *All validators advance together, then all crash at the same height*. + + This case has been raised in some discussions, the main concern being whether the vote extensions + for the previous height would be lost across the network. With the improvement described above, + namely persisting the latest extended commit at decision time, this case is solved. + When a crashed validator recovers, it recovers the last extended commit from persistent storage + and handshakes with the Application. + If need be, it also reconstructs messages for the unfinished height + (including all precommits received) from the WAL. + Then, the validator can resume where it was at the time of the crash. Thus, as extensions are + persisted, either in the WAL (in the form of received precommit messages), or in the latest + extended commit, the only way that vote extensions needed to start the next height could be lost + forever would be if all validators crashed and never recovered (e.g. disk corruption). + Since a *correct* node MUST eventually recover, this violates Tendermint's assumption of more than + *2nh/3* correct validators for every height *h*. + + No problem to solve here. + +- **(c)** *Lagging majority*. + + Let us assume the validator set does not change between *h* and *h+1*. + It is not possible by the nature of the Tendermint algorithm, which requires more + than *2nh/3* precommit votes for some round of height *h* in order to make progress. + So, only up to *nh/3* validators can lag behind. + + On the other hand, for the case where there are changes to the validator set between *h* and + *h+1* please see case (d) below, where the extreme case is discussed. + +- **(d)** *Validator set changes completely between* h *and* h+1. + + If sets *valseth* and *valseth+1* are disjoint, + more than *2nh/3* of validators in height *h* should + have actively participated in conensus in *h*. So, as of height *h*, only a minority of validators + in *h* can be lagging behind, although they could all lag behind from *h+1* on, as they are no + longer validators, only full nodes. This situation falls under the assumptions of case (h) below. + + As for validators in *valseth+1*, as they were not validators as of height *h*, they + could all be lagging behind by that time. However, by the time *h* finishes and *h+1* begins, the + chain will halt until more than *2nh+1/3* of them have caught up and started consensus + at height *h+1*. If set *valseth+1* does not change in *h+2* and subsequent + heights, only up to *nh+1/3* validators will be able to lag behind. Thus, we have + converted this case into case (h) below. + +- **(e)** *Enough validators crash to block the rest*. + + In this case, blockchain progress halts, i.e. surviving full nodes keep increasing rounds + indefinitely, until some of the crashed validators are able to recover. + Those validators that recover first will handshake with the Application and recover at the height + they crashed, which is still the same the nodes that did not crash are stuck in, so they don't need + to catch up. + Further, they had persisted the extended commit for the previous height. Nothing to solve. + + For those validators recovering later, we are in case (h) below. + +- **(f)** *Some validators crash, but not enough to block progress*. + + When the correct processes that crashed recover, they handshake with the Application and resume at + the height they were at when they crashed. As the blockchain did not stop making progress, the + recovered processes are likely to have fallen behind with respect to the progressing majority. + + At this point, the recovered processes are in case (h) below. + +- **(g)** *A new full node starts*. + + The reasoning here also applies to the case when more than one full node are starting. + When the full node starts from scratch, it has no state (its current height is 0). Ignoring + statesync for the time being, the node just needs to catch up by applying past blocks one by one + (after verifying them). + + Thus, the node is in case (h) below. + +- **(h)** *Advancing majority, lagging minority* + + In this case, some nodes are late. More precisely, at the present time, a set of full nodes, + denoted *Lhp*, are falling behind + (e.g., temporary disconnection or network partition, memory thrashing, crashes, new nodes) + an arbitrary + number of heights: + between *hs* and *hp*, where *hs < hp*, and + *hp* is the highest height + any correct full node has reached so far. + + The correct full nodes that reached *hp* were able to decide for *hp-1*. + Therefore, less than *nhp-1/3* validators of *hp-1* can be part + of *Lhp*, since enough up-to-date validators needed to actively participate + in consensus for *hp-1*. + + Since, at the present time, + no node in *Lhp* took part in any consensus between + *hs* and *hp-1*, + the reasoning above can be extended to validator set changes between *hs* and + *hp-1*. This results in the following restriction on the full nodes that can be part of *Lhp*. + + - ∀ *h*, where *hs ≤ h < hp*, + | *valseth* ∩ *Lhp* | *< nh/3* + + If this property does not hold for a particular height *h*, where + *hs ≤ h < hp*, Tendermint could not have progressed beyond *h* and + therefore no full node could have reached *hp* (a contradiction). + + These lagging nodes in *Lhp* need to catch up. They have to obtain the + information needed to make + progress from other nodes. For each height *h* between *hs* and *hp-2*, + this includes the decided block for *h*, and the + precommit votes also for *deciding h* (which can be extracted from the block at height *h+1*). + + At a given height *hc* (where possibly *hc << hp*), + a full node in *Lhp* will consider itself *caught up*, based on the + (maybe out of date) information it is getting from its peers. Then, the node needs to be ready to + propose at height *hc+1*, which requires having received the vote extensions for + *hc*. + As the vote extensions are *not* stored in the blocks, and it is difficult to have strong + guarantees on *when* a late node considers itself caught up, providing the late node with the right + vote extensions for the right height poses a problem. + +At this point, we have described and compared all cases raised in discussions leading up to this +RFC. The list above aims at being exhaustive. The analysis of each case included above makes all of +them converge into case (h). + +### Current Catch-up Mechanisms + +We now briefly describe the current catch-up mechanisms in the reactors concerned in Tendermint. + +#### Statesync + +Full nodes optionally run statesync just after starting, when they start from scratch. +If statesync succeeds, an Application snapshot is installed, and Tendermint jumps from height 0 directly +to the height the Application snapshop represents, without applying the block of any previous height. +Some light blocks are received and stored in the block store for running light-client verification of +all the skipped blocks. Light blocks are incomplete blocks, typically containing the header and the +canonical commit but, e.g., no transactions. They are stored in the block store as "signed headers". + +The statesync reactor is not really relevant for solving the problem discussed in this RFC. We will +nevertheless mention it when needed; in particular, to understand some corner cases. + +#### Blocksync + +The blocksync reactor kicks in after start up or recovery (and, optionally, after statesync is done) +and sends the following messages to its peers: + +- `StatusRequest` to query the height its peers are currently at, and +- `BlockRequest`, asking for blocks of heights the local node is missing. + +Using `BlockResponse` messages received from peers, the blocksync reactor validates each received +block using the block of the following height, saves the block in the block store, and sends the +block to the Application for execution. + +If blocksync has validated and applied the block for the height *previous* to the highest seen in +a `StatusResponse` message, or if no progress has been made after a timeout, the node considers +itself as caught up and switches to the consensus reactor. + +#### Consensus Reactor + +The consensus reactor runs the full Tendermint algorithm. For a validator this means it has to +propose blocks, and send/receive prevote/precommit messages, as mandated by Tendermint, before it can +decide and move on to the next height. + +If a full node that is running the consensus reactor falls behind at height *h*, when a peer node +realises this it will retrieve the canonical commit of *h+1* from the block store, and *convert* +it into a set of precommit votes and will send those to the late node. + +## Discussion + +### Solutions Proposed + +These are the solutions proposed in discussions leading up to this RFC. + +- **Solution 0.** *Vote extensions are made **best effort** in the specification*. + + This is the simplest solution, considered as a way to provide vote extensions in a simple enough + way so that it can be part of v0.36. + It consists in changing the specification so as to not *require* that precommit votes used upon + `PrepareProposal` contain their corresponding vote extensions. In other words, we render vote + extensions optional. + There are strong implications stemming from such a relaxation of the original specification. + + - As a vote extension is signed *separately* from the vote it is extending, an intermediate node + can now remove (i.e., censor) vote extensions from precommit messages at will. + - Further, there is no point anymore in the spec requiring the Application to accept a vote extension + passed via `VerifyVoteExtension` to consider a precommit message valid in its entirety. Remember + this behavior of `VerifyVoteExtension` is adding a constraint to Tendermint's conditions for + liveness. + In this situation, it is better and simpler to just drop the vote extension rejected by the + Application via `VerifyVoteExtension`, but still consider the precommit vote itself valid as long + as its signature verifies. + +- **Solution 1.** *Include vote extensions in the blockchain*. + + Another obvious solution, which has somehow been considered in the past, is to include the vote + extensions and their signatures in the blockchain. + The blockchain would thus include the extended commit, rather than a regular commit, as the structure + to be canonicalized in the next block. + With this solution, the current mechanisms implemented both in the blocksync and consensus reactors + would still be correct, as all the information a node needs to catch up, and to start proposing when + it considers itself as caught-up, can now be recovered from past blocks saved in the block store. + + This solution has two main drawbacks. + + - As the block format must change, upgrading a chain requires a hard fork. Furthermore, + all existing light client implementations will stop working until they are upgraded to deal with + the new format (e.g., how certain hashes calculated and/or how certain signatures are checked). + For instance, let us consider IBC, which relies on light clients. An IBC connection between + two chains will be broken if only one chain upgrades. + - The extra information (i.e., the vote extensions) that is now kept in the blockchain is not really + needed *at every height* for a late node to catch up. + - This information is only needed to be able to *propose* at the height the validator considers + itself as caught-up. If a validator is indeed late for height *h*, it is useless (although + correct) for it to call `PrepareProposal`, or `ExtendVote`, since the block is already decided. + - Moreover, some use cases require pretty sizeable vote extensions, which would result in an + important waste of space in the blockchain. + +- **Solution 2.** *Skip* propose *step in Tendermint algorithm*. + + This solution consists in modifying the Tendermint algorithm to skip the *send proposal* step in + heights where the node does not have the required vote extensions to populate the call to + `PrepareProposal`. The main idea behind this is that it should only happen when the validator is late + and, therefore, up-to-date validators have already proposed (and decided) for that height. + A small variation of this solution is, rather than skipping the *send proposal* step, the validator + sends a special *empty* or *bottom* (⊥) proposal to signal other nodes that it is not ready to propose + at (any round of) the current height. + + The appeal of this solution is its simplicity. A possible implementation does not need to extend + the data structures, or change the current catch-up mechanisms implemented in the blocksync or + in the consensus reactor. When we lack the needed information (vote extensions), we simply rely + on another correct validator to propose a valid block in other rounds of the current height. + + However, this solution can be attacked by a byzantine node in the network in the following way. + Let us consider the following scenario: + + - all validators in *valseth* send out precommit messages, with vote extensions, + for height *h*, round 0, roughly at the same time, + - all those precommit messages contain non-`nil` precommit votes, which vote for block *b* + - all those precommit messages sent in height *h*, round 0, and all messages sent in + height *h*, round *r > 0* get delayed indefinitely, so, + - all validators in *valseth* keep waiting for enough precommit + messages for height *h*, round 0, needed for deciding in height *h* + - an intermediate (malicious) full node *m* manages to receive block *b*, and gather more than + *2nh/3* precommit messages for height *h*, round 0, + - one way or another, the solution should have either (a) a mechanism for a full node to *tell* + another full node it is late, or (b) a mechanism for a full node to conclude it is late based + on other full nodes' messages; any of these mechanisms should, at the very least, + require the late node receiving the decided block and a commit (not necessarily an extended + commit) for *h*, + - node *m* uses the gathered precommit messages to build a commit for height *h*, round 0, + - in order to convince full nodes that they are late, node *m* either (a) *tells* them they + are late, or (b) shows them it (i.e. *m*) is ahead, by sending them block *b*, along with the + commit for height *h*, round 0, + - all full nodes conclude they are late from *m*'s behavior, and use block *b* and the commit for + height *h*, round 0, to decide on height *h*, and proceed to height *h+1*. + + At this point, *all* full nodes, including all validators in *valseth+1*, have advanced + to height *h+1* believing they are late, and so, expecting the *hypothetical* leading majority of + validators in *valseth+1* to propose for *h+1*. As a result, the blockhain + grinds to a halt. + A (rather complex) ad-hoc mechanism would need to be carried out by node operators to roll + back all validators to the precommit step of height *h*, round *r*, so that they can regenerate + vote extensions (remember vote extensions are non-deterministic) and continue execution. + +- **Solution 3.** *Require extended commits to be available at switching time*. + + This one is more involved than all previous solutions, and builds on an idea present in Solution 2: + vote extensions are actually not needed for Tendermint to make progress as long as the + validator is *certain* it is late. + + We define two modes. The first is denoted *catch-up mode*, and Tendermint only calls + `FinalizeBlock` for each height when in this mode. The second is denoted *consensus mode*, in + which the validator considers itself up to date and fully participates in consensus and calls + `PrepareProposal`/`ProcessProposal`, `ExtendVote`, and `VerifyVoteExtension`, before calling + `FinalizeBlock`. + + The catch-up mode does not need vote extension information to make progress, as all it needs is the + decided block at each height to call `FinalizeBlock` and keep the state-machine replication making + progress. The consensus mode, on the other hand, does need vote extension information when + starting every height. + + Validators are in consensus mode by default. When a validator in consensus mode falls behind + for whatever reason, e.g. cases (b), (d), (e), (f), (g), or (h) above, we introduce the following + key safety property: + + - for every height *hp*, a full node *f* in *hp* refuses to switch to catch-up + mode **until** there exists a height *h'* such that: + - *p* has received and (light-client) verified the blocks of + all heights *h*, where *hp ≤ h ≤ h'* + - it has received an extended commit for *h'* and has verified: + - the precommit vote signatures in the extended commit + - the vote extension signatures in the extended commit: each is signed with the same + key as the precommit vote it extends + + If the condition above holds for *hp*, namely receiving a valid sequence of blocks in + the *f*'s future, and an extended commit corresponding to the last block in the sequence, then + node *f*: + + - switches to catch-up mode, + - applies all blocks between *hp* and *h'* (calling `FinalizeBlock` only), and + - switches back to consensus mode using the extended commit for *h'* to propose in the rounds of + *h' + 1* where it is the proposer. + + This mechanism, together with the invariant it uses, ensures that the node cannot be attacked by + being fed a block without extensions to make it believe it is late, in a similar way as explained + for Solution 2. + +### Feasibility of the Proposed Solutions + +Solution 0, besides the drawbacks described in the previous section, provides guarantees that are +weaker than the rest. The Application does not have the assurance that more than *2nh/3* vote +extensions will *always* be available when calling `PrepareProposal` at height *h+1*. +This level of guarantees is probably not strong enough for vote extensions to be useful for some +important use cases that motivated them in the first place, e.g., encrypted mempool transactions. + +Solution 1, while being simple in that the changes needed in the current Tendermint codebase would +be rather small, is changing the block format, and would therefore require all blockchains using +Tendermint v0.35 or earlier to hard-fork when upgrading to v0.36. + +Since Solution 2 can be attacked, one might prefer Solution 3, even if it is more involved +to implement. Further, we must elaborate on how we can turn Solution 3, described in abstract +terms in the previous section, into a concrete implementation compatible with the current +Tendermint codebase. + +### Current Limitations and Possible Implementations + +The main limitations affecting the current version of Tendermint are the following. + +- The current version of the blocksync reactor does not use the full + [light client verification](https://github.com/tendermint/tendermint/blob/4743a7ad0/spec/light-client/README.md) + algorithm to validate blocks coming from other peers. +- The code being structured into the blocksync and consensus reactors, only switching from the + blocksync reactor to the consensus reactor is supported; switching in the opposite direction is + not supported. Alternatively, the consensus reactor could have a mechanism allowing a late node + to catch up by skipping calls to `PrepareProposal`/`ProcessProposal`, and + `ExtendVote`/`VerifyVoteExtension` and only calling `FinalizeBlock` for each height. + Such a mechanism does not exist at the time of writing this RFC. + +The blocksync reactor featuring light client verification is being actively worked on (tentatively +for v0.37). So it is best if this RFC does not try to delve into that problem, but just makes sure +its outcomes are compatible with that effort. + +In subsection [Cases to Address](#cases-to-address), we concluded that we can focus on +solving case (h) in theoretical terms. +However, as the current Tendermint version does not yet support switching back to blocksync once a +node has switched to consensus, we need to split case (h) into two cases. When a full node needs to +catch up... + +- **(h.1)** ... it has not switched yet from the blocksync reactor to the consensus reactor, or + +- **(h.2)** ... it has already switched to the consensus reactor. + +This is important in order to discuss the different possible implementations. + +#### Base Implementation: Persist and Propagate Extended Commit History + +In order to circumvent the fact that we cannot switch from the consensus reactor back to blocksync, +rather than just keeping the few most recent extended commits, nodes will need to keep +and gossip a backlog of extended commits so that the consensus reactor can still propose and decide +in out-of-date heights (even if those proposals will be useless). + +The base implementation - for which an experimental patch exists - consists in the conservative +approach of persisting in the block store *all* extended commits for which we have also stored +the full block. Currently, when statesync is run at startup, it saves light blocks. +This base implementation does not seek +to receive or persist extended commits for those light blocks as they would not be of any use. + +Then, we modify the blocksync reactor so that peers *always* send requested full blocks together +with the corresponding extended commit in the `BlockResponse` messages. This guarantees that the +block store being reconstructed by blocksync has the same information as that of peers that are +up to date (at least starting from the latest snapshot applied by statesync before starting blocksync). +Thus, blocksync has all the data it requires to switch to the consensus reactor, as long as one of +the following exit conditions are met: + +- The node is still at height 0 (where no commit or extended commit is needed) +- The node has processed at least 1 block in blocksync + +The second condition is needed in case the node has installed an Application snapshot during statesync. +If that is the case, at the time blocksync starts, the block store only has the data statesync has saved: +light blocks, and no extended commits. +Hence we need to blocksync at least one block from another node, which will be sent with its corresponding extended commit, before we can switch to consensus. + +As a side note, a chain might be started at a height *hi > 0*, all other heights +*h < hi* being non-existent. In this case, the chain is still considered to be at height 0 before +block *hi* is applied, so the first condition above allows the node to switch to consensus even +if blocksync has not processed any block (which is always the case if all nodes are starting from scratch). + +When a validator falls behind while having already switched to the consensus reactor, a peer node can +simply retrieve the extended commit for the required height from the block store and reconstruct a set of +precommit votes together with their extensions and send them in the form of precommit messages to the +validator falling behind, regardless of whether the peer node holds the extended commit because it +actually participated in that consensus and thus received the precommit messages, or it received the extended commit via a `BlockResponse` message while running blocksync. + +This solution requires a few changes to the consensus reactor: + +- upon saving the block for a given height in the block store at decision time, save the + corresponding extended commit as well +- in the catch-up mechanism, when a node realizes that another peer is more than 2 heights + behind, it uses the extended commit (rather than the canoncial commit as done previously) to + reconstruct the precommit votes with their corresponding extensions + +The changes to the blocksync reactor are more substantial: + +- the `BlockResponse` message is extended to include the extended commit of the same height as + the block included in the response (just as they are stored in the block store) +- structure `bpRequester` is likewise extended to hold the received extended commits coming in + `BlockResponse` messages +- method `PeekTwoBlocks` is modified to also return the extended commit corresponding to the first block +- when successfully verifying a received block, the reactor saves its corresponding extended commit in + the block store + +The two main drawbacks of this base implementation are: + +- the increased size taken by the block store, in particular with big extensions +- the increased bandwith taken by the new format of `BlockResponse` + +#### Possible Optimization: Pruning the Extended Commit History + +If we cannot switch from the consensus reactor back to the blocksync reactor we cannot prune the extended commit backlog in the block store without sacrificing the implementation's correctness. The asynchronous +nature of our distributed system model allows a process to fall behing an arbitrary number of +heights, and thus all extended commits need to be kept *just in case* a node that late had +previously switched to the consensus reactor. + +However, there is a possibility to optimize the base implementation. Every time we enter a new height, +we could prune from the block store all extended commits that are more than *d* heights in the past. +Then, we need to handle two new situations, roughly equivalent to cases (h.1) and (h.2) described above. + +- (h.1) A node starts from scratch or recovers after a crash. In thisy case, we need to modify the + blocksync reactor's base implementation. + - when receiving a `BlockResponse` message, it MUST accept that the extended commit set to `nil`, + - when sending a `BlockResponse` message, if the block store contains the extended commit for that + height, it MUST set it in the message, otherwise it sets it to `nil`, + - the exit conditions used for the base implementation are no longer valid; the only reliable exit + condition now consists in making sure that the last block processed by blocksync was received with + the corresponding commit, and not `nil`; this extended commit will allow the node to switch from + the blocksync reactor to the consensus reactor and immediately act as a proposer if required. +- (h.2) A node already running the consensus reactor falls behind beyond *d* heights. In principle, + the node will be stuck forever as no other node can provide the vote extensions it needs to make + progress (they all have pruned the corresponding extended commit). + However we can manually have the node crash and recover as a workaround. This effectively converts + this case into (h.1). + +### Formalization Work + +A formalization work to show or prove the correctness of the different use cases and solutions +presented here (and any other that may be found) needs to be carried out. +A question that needs a precise answer is how many extended commits (one?, two?) a node needs +to keep in persistent memory when implementing Solution 3 described above without Tendermint's +current limitations. +Another important invariant we need to prove formally is that the set of vote extensions +required to make progress will always be held somewhere in the network. + +## References + +- [ABCI++ specification](https://github.com/tendermint/tendermint/blob/4743a7ad0/spec/abci%2B%2B/README.md) +- [ABCI as of v0.35](https://github.com/tendermint/spec/blob/4fb99af/spec/abci/README.md) +- [Vote extensions issue](https://github.com/tendermint/tendermint/issues/8174) +- [Light client verification](https://github.com/tendermint/tendermint/blob/4743a7ad0/spec/light-client/README.md) diff --git a/internal/state/indexer/metrics.gen.go b/internal/state/indexer/metrics.gen.go new file mode 100644 index 000000000..8b079d8d5 --- /dev/null +++ b/internal/state/indexer/metrics.gen.go @@ -0,0 +1,51 @@ +// Code generated by metricsgen. DO NOT EDIT. + +package indexer + +import ( + "github.com/go-kit/kit/metrics/discard" + prometheus "github.com/go-kit/kit/metrics/prometheus" + stdprometheus "github.com/prometheus/client_golang/prometheus" +) + +func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { + labels := []string{} + for i := 0; i < len(labelsAndValues); i += 2 { + labels = append(labels, labelsAndValues[i]) + } + return &Metrics{ + BlockEventsSeconds: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "block_events_seconds", + Help: "Latency for indexing block events.", + }, labels).With(labelsAndValues...), + TxEventsSeconds: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "tx_events_seconds", + Help: "Latency for indexing transaction events.", + }, labels).With(labelsAndValues...), + BlocksIndexed: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "blocks_indexed", + Help: "Number of complete blocks indexed.", + }, labels).With(labelsAndValues...), + TransactionsIndexed: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "transactions_indexed", + Help: "Number of transactions indexed.", + }, labels).With(labelsAndValues...), + } +} + +func NopMetrics() *Metrics { + return &Metrics{ + BlockEventsSeconds: discard.NewHistogram(), + TxEventsSeconds: discard.NewHistogram(), + BlocksIndexed: discard.NewCounter(), + TransactionsIndexed: discard.NewCounter(), + } +} diff --git a/internal/state/indexer/metrics.go b/internal/state/indexer/metrics.go index aa64a4bb2..0b92b879e 100644 --- a/internal/state/indexer/metrics.go +++ b/internal/state/indexer/metrics.go @@ -2,12 +2,10 @@ package indexer import ( "github.com/go-kit/kit/metrics" - "github.com/go-kit/kit/metrics/discard" - - prometheus "github.com/go-kit/kit/metrics/prometheus" - stdprometheus "github.com/prometheus/client_golang/prometheus" ) +//go:generate go run github.com/tendermint/tendermint/scripts/metricsgen -struct=Metrics + // MetricsSubsystem is a the subsystem label for the indexer package. const MetricsSubsystem = "indexer" @@ -25,49 +23,3 @@ type Metrics struct { // Number of transactions indexed. TransactionsIndexed metrics.Counter } - -// PrometheusMetrics returns Metrics build using Prometheus client library. -// Optionally, labels can be provided along with their values ("foo", -// "fooValue"). -func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { - labels := []string{} - for i := 0; i < len(labelsAndValues); i += 2 { - labels = append(labels, labelsAndValues[i]) - } - return &Metrics{ - BlockEventsSeconds: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ - Namespace: namespace, - Subsystem: MetricsSubsystem, - Name: "block_events_seconds", - Help: "Latency for indexing block events.", - }, labels).With(labelsAndValues...), - TxEventsSeconds: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ - Namespace: namespace, - Subsystem: MetricsSubsystem, - Name: "tx_events_seconds", - Help: "Latency for indexing transaction events.", - }, labels).With(labelsAndValues...), - BlocksIndexed: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ - Namespace: namespace, - Subsystem: MetricsSubsystem, - Name: "blocks_indexed", - Help: "Number of complete blocks indexed.", - }, labels).With(labelsAndValues...), - TransactionsIndexed: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ - Namespace: namespace, - Subsystem: MetricsSubsystem, - Name: "transactions_indexed", - Help: "Number of transactions indexed.", - }, labels).With(labelsAndValues...), - } -} - -// NopMetrics returns an indexer metrics stub that discards all samples. -func NopMetrics() *Metrics { - return &Metrics{ - BlockEventsSeconds: discard.NewHistogram(), - TxEventsSeconds: discard.NewHistogram(), - BlocksIndexed: discard.NewCounter(), - TransactionsIndexed: discard.NewCounter(), - } -} diff --git a/scripts/metricsgen/metricsgen.go b/scripts/metricsgen/metricsgen.go new file mode 100644 index 000000000..70cb36a77 --- /dev/null +++ b/scripts/metricsgen/metricsgen.go @@ -0,0 +1,334 @@ +// metricsgen is a code generation tool for creating constructors for Tendermint +// metrics types. +package main + +import ( + "bytes" + "flag" + "fmt" + "go/ast" + "go/format" + "go/parser" + "go/token" + "go/types" + "io" + "io/fs" + "log" + "os" + "path" + "path/filepath" + "reflect" + "regexp" + "strconv" + "strings" + "text/template" +) + +func init() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, `Usage: %[1]s -struct + +Generate constructors for the metrics type specified by -struct contained in +the current directory. The tool creates a new file in the current directory +containing the generated code. + +Options: +`, filepath.Base(os.Args[0])) + flag.PrintDefaults() + } +} + +const metricsPackageName = "github.com/go-kit/kit/metrics" + +const ( + metricNameTag = "metrics_name" + labelsTag = "metrics_labels" + bucketTypeTag = "metrics_buckettype" + bucketSizeTag = "metrics_bucketsizes" +) + +var ( + dir = flag.String("dir", ".", "Path to the directory containing the target package") + strct = flag.String("struct", "Metrics", "Struct to parse for metrics") +) + +var bucketType = map[string]string{ + "exprange": "stdprometheus.ExponentialBucketsRange", + "exp": "stdprometheus.ExponentialBuckets", + "lin": "stdprometheus.LinearBuckets", +} + +var tmpl = template.Must(template.New("tmpl").Parse(`// Code generated by metricsgen. DO NOT EDIT. + +package {{ .Package }} + +import ( + "github.com/go-kit/kit/metrics/discard" + prometheus "github.com/go-kit/kit/metrics/prometheus" + stdprometheus "github.com/prometheus/client_golang/prometheus" +) + +func PrometheusMetrics(namespace string, labelsAndValues...string) *Metrics { + labels := []string{} + for i := 0; i < len(labelsAndValues); i += 2 { + labels = append(labels, labelsAndValues[i]) + } + return &Metrics{ + {{ range $metric := .ParsedMetrics }} + {{- $metric.FieldName }}: prometheus.New{{ $metric.TypeName }}From(stdprometheus.{{$metric.TypeName }}Opts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "{{$metric.MetricName }}", + Help: "{{ $metric.Description }}", + {{ if ne $metric.HistogramOptions.BucketType "" }} + Buckets: {{ $metric.HistogramOptions.BucketType }}({{ $metric.HistogramOptions.BucketSizes }}), + {{ else if ne $metric.HistogramOptions.BucketSizes "" }} + Buckets: []float64{ {{ $metric.HistogramOptions.BucketSizes }} }, + {{ end }} + {{- if eq (len $metric.Labels) 0 }} + }, labels).With(labelsAndValues...), + {{ else }} + }, append(labels, {{$metric.Labels | printf "%q" }})).With(labelsAndValues...), + {{- end }} + {{- end }} + } +} + + +func NopMetrics() *Metrics { + return &Metrics{ + {{- range $metric := .ParsedMetrics }} + {{ $metric.FieldName }}: discard.New{{ $metric.TypeName }}(), + {{- end }} + } +} +`)) + +// ParsedMetricField is the data parsed for a single field of a metric struct. +type ParsedMetricField struct { + TypeName string + FieldName string + MetricName string + Description string + Labels string + + HistogramOptions HistogramOpts +} + +type HistogramOpts struct { + BucketType string + BucketSizes string +} + +// TemplateData is all of the data required for rendering a metric file template. +type TemplateData struct { + Package string + ParsedMetrics []ParsedMetricField +} + +func main() { + flag.Parse() + if *strct == "" { + log.Fatal("You must specify a non-empty -struct") + } + td, err := ParseMetricsDir(".", *strct) + if err != nil { + log.Fatalf("Parsing file: %v", err) + } + out := filepath.Join(*dir, "metrics.gen.go") + f, err := os.Create(out) + if err != nil { + log.Fatalf("Opening file: %v", err) + } + err = GenerateMetricsFile(f, td) + if err != nil { + log.Fatalf("Generating code: %v", err) + } +} +func ignoreTestFiles(f fs.FileInfo) bool { + return !strings.Contains(f.Name(), "_test.go") +} + +// ParseMetricsDir parses the dir and scans for a struct matching structName, +// ignoring all test files. ParseMetricsDir iterates the fields of the metrics +// struct and builds a TemplateData using the data obtained from the abstract syntax tree. +func ParseMetricsDir(dir string, structName string) (TemplateData, error) { + fs := token.NewFileSet() + d, err := parser.ParseDir(fs, dir, ignoreTestFiles, parser.ParseComments) + if err != nil { + return TemplateData{}, err + } + if len(d) > 1 { + return TemplateData{}, fmt.Errorf("multiple packages found in %s", dir) + } + if len(d) == 0 { + return TemplateData{}, fmt.Errorf("no go pacakges found in %s", dir) + } + + // Grab the package name. + var pkgName string + var pkg *ast.Package + for pkgName, pkg = range d { + } + td := TemplateData{ + Package: pkgName, + } + // Grab the metrics struct + m, mPkgName, err := findMetricsStruct(pkg.Files, structName) + if err != nil { + return TemplateData{}, err + } + for _, f := range m.Fields.List { + if !isMetric(f.Type, mPkgName) { + continue + } + pmf := parseMetricField(f) + td.ParsedMetrics = append(td.ParsedMetrics, pmf) + } + + return td, err +} + +// GenerateMetricsFile executes the metrics file template, writing the result +// into the io.Writer. +func GenerateMetricsFile(w io.Writer, td TemplateData) error { + b := []byte{} + buf := bytes.NewBuffer(b) + err := tmpl.Execute(buf, td) + if err != nil { + return err + } + b, err = format.Source(buf.Bytes()) + if err != nil { + return err + } + _, err = io.Copy(w, bytes.NewBuffer(b)) + if err != nil { + return err + } + return nil +} + +func findMetricsStruct(files map[string]*ast.File, structName string) (*ast.StructType, string, error) { + var ( + st *ast.StructType + ) + for _, file := range files { + mPkgName, err := extractMetricsPackageName(file.Imports) + if err != nil { + return nil, "", fmt.Errorf("unable to determine metrics package name: %v", err) + } + if !ast.FilterFile(file, func(name string) bool { + return name == structName + }) { + continue + } + ast.Inspect(file, func(n ast.Node) bool { + switch f := n.(type) { + case *ast.TypeSpec: + if f.Name.Name == structName { + var ok bool + st, ok = f.Type.(*ast.StructType) + if !ok { + err = fmt.Errorf("found identifier for %q of wrong type", structName) + } + } + return false + default: + return true + } + }) + if err != nil { + return nil, "", err + } + if st != nil { + return st, mPkgName, nil + } + } + return nil, "", fmt.Errorf("target struct %q not found in dir", structName) +} + +func parseMetricField(f *ast.Field) ParsedMetricField { + var comment string + if f.Doc != nil { + for _, c := range f.Doc.List { + comment += strings.TrimPrefix(c.Text, "// ") + } + } + pmf := ParsedMetricField{ + Description: comment, + MetricName: extractFieldName(f.Names[0].String(), f.Tag), + FieldName: f.Names[0].String(), + TypeName: extractTypeName(f.Type), + Labels: extractLabels(f.Tag), + } + if pmf.TypeName == "Histogram" { + pmf.HistogramOptions = extractHistogramOptions(f.Tag) + } + return pmf +} + +func extractTypeName(e ast.Expr) string { + return strings.TrimPrefix(path.Ext(types.ExprString(e)), ".") +} + +func isMetric(e ast.Expr, mPkgName string) bool { + return strings.Contains(types.ExprString(e), fmt.Sprintf("%s.", mPkgName)) +} + +func extractLabels(bl *ast.BasicLit) string { + if bl != nil { + t := reflect.StructTag(strings.Trim(bl.Value, "`")) + if v := t.Get(labelsTag); v != "" { + return v + } + } + return "" +} + +func extractFieldName(name string, tag *ast.BasicLit) string { + if tag != nil { + t := reflect.StructTag(strings.Trim(tag.Value, "`")) + if v := t.Get(metricNameTag); v != "" { + return v + } + } + return toSnakeCase(name) +} + +func extractHistogramOptions(tag *ast.BasicLit) HistogramOpts { + h := HistogramOpts{} + if tag != nil { + t := reflect.StructTag(strings.Trim(tag.Value, "`")) + if v := t.Get(bucketTypeTag); v != "" { + h.BucketType = bucketType[v] + } + if v := t.Get(bucketSizeTag); v != "" { + h.BucketSizes = v + } + } + return h +} + +func extractMetricsPackageName(imports []*ast.ImportSpec) (string, error) { + for _, i := range imports { + u, err := strconv.Unquote(i.Path.Value) + if err != nil { + return "", err + } + if u == metricsPackageName { + if i.Name != nil { + return i.Name.Name, nil + } + return path.Base(u), nil + } + } + return "", nil +} + +var capitalChange = regexp.MustCompile("([a-z0-9])([A-Z])") + +func toSnakeCase(str string) string { + snake := capitalChange.ReplaceAllString(str, "${1}_${2}") + return strings.ToLower(snake) +} diff --git a/scripts/metricsgen/metricsgen_test.go b/scripts/metricsgen/metricsgen_test.go new file mode 100644 index 000000000..83251e651 --- /dev/null +++ b/scripts/metricsgen/metricsgen_test.go @@ -0,0 +1,259 @@ +package main_test + +import ( + "bytes" + "fmt" + "go/parser" + "go/token" + "io" + "io/ioutil" + "os" + "path" + "path/filepath" + "testing" + + "github.com/stretchr/testify/require" + metricsgen "github.com/tendermint/tendermint/scripts/metricsgen" +) + +const testDataDir = "./testdata" + +func TestSimpleTemplate(t *testing.T) { + m := metricsgen.ParsedMetricField{ + TypeName: "Histogram", + FieldName: "MyMetric", + MetricName: "request_count", + Description: "how many requests were made since the start of the process", + Labels: "first, second, third", + } + td := metricsgen.TemplateData{ + Package: "mypack", + ParsedMetrics: []metricsgen.ParsedMetricField{m}, + } + b := bytes.NewBuffer([]byte{}) + err := metricsgen.GenerateMetricsFile(b, td) + if err != nil { + t.Fatalf("unable to parse template %v", err) + } +} + +func TestFromData(t *testing.T) { + infos, err := ioutil.ReadDir(testDataDir) + if err != nil { + t.Fatalf("unable to open file %v", err) + } + for _, dir := range infos { + t.Run(dir.Name(), func(t *testing.T) { + if !dir.IsDir() { + t.Fatalf("expected file %s to be directory", dir.Name()) + } + dirName := path.Join(testDataDir, dir.Name()) + pt, err := metricsgen.ParseMetricsDir(dirName, "Metrics") + if err != nil { + t.Fatalf("unable to parse from dir %q: %v", dir, err) + } + outFile := path.Join(dirName, "out.go") + if err != nil { + t.Fatalf("unable to open file %s: %v", outFile, err) + } + of, err := os.Create(outFile) + if err != nil { + t.Fatalf("unable to open file %s: %v", outFile, err) + } + defer os.Remove(outFile) + if err := metricsgen.GenerateMetricsFile(of, pt); err != nil { + t.Fatalf("unable to generate metrics file %s: %v", outFile, err) + } + if _, err := parser.ParseFile(token.NewFileSet(), outFile, nil, parser.AllErrors); err != nil { + t.Fatalf("unable to parse generated file %s: %v", outFile, err) + } + bNew, err := ioutil.ReadFile(outFile) + if err != nil { + t.Fatalf("unable to read generated file %s: %v", outFile, err) + } + goldenFile := path.Join(dirName, "metrics.gen.go") + bOld, err := ioutil.ReadFile(goldenFile) + if err != nil { + t.Fatalf("unable to read file %s: %v", goldenFile, err) + } + if !bytes.Equal(bNew, bOld) { + t.Fatalf("newly generated code in file %s does not match golden file %s\n"+ + "if the output of the metricsgen tool is expected to change run the following make target: \n"+ + "\tmake metrics", outFile, goldenFile) + } + }) + } +} + +func TestParseMetricsStruct(t *testing.T) { + const pkgName = "mypkg" + metricsTests := []struct { + name string + shouldError bool + metricsStruct string + expected metricsgen.TemplateData + }{ + { + name: "basic", + metricsStruct: `type Metrics struct { + myGauge metrics.Gauge + }`, + expected: metricsgen.TemplateData{ + Package: pkgName, + ParsedMetrics: []metricsgen.ParsedMetricField{ + { + TypeName: "Gauge", + FieldName: "myGauge", + MetricName: "my_gauge", + }, + }, + }, + }, + { + name: "histogram", + metricsStruct: "type Metrics struct {\n" + + "myHistogram metrics.Histogram `metrics_buckettype:\"exp\" metrics_bucketsizes:\"1, 100, .8\"`\n" + + "}", + expected: metricsgen.TemplateData{ + Package: pkgName, + ParsedMetrics: []metricsgen.ParsedMetricField{ + { + TypeName: "Histogram", + FieldName: "myHistogram", + MetricName: "my_histogram", + + HistogramOptions: metricsgen.HistogramOpts{ + BucketType: "stdprometheus.ExponentialBuckets", + BucketSizes: "1, 100, .8", + }, + }, + }, + }, + }, + { + name: "labeled name", + metricsStruct: "type Metrics struct {\n" + + "myCounter metrics.Counter `metrics_name:\"new_name\"`\n" + + "}", + expected: metricsgen.TemplateData{ + Package: pkgName, + ParsedMetrics: []metricsgen.ParsedMetricField{ + { + TypeName: "Counter", + FieldName: "myCounter", + MetricName: "new_name", + }, + }, + }, + }, + { + name: "metric labels", + metricsStruct: "type Metrics struct {\n" + + "myCounter metrics.Counter `metrics_labels:\"label1, label2\"`\n" + + "}", + expected: metricsgen.TemplateData{ + Package: pkgName, + ParsedMetrics: []metricsgen.ParsedMetricField{ + { + TypeName: "Counter", + FieldName: "myCounter", + MetricName: "my_counter", + Labels: "label1, label2", + }, + }, + }, + }, + { + name: "ignore non-metric field", + metricsStruct: `type Metrics struct { + myCounter metrics.Counter + nonMetric string + }`, + expected: metricsgen.TemplateData{ + Package: pkgName, + ParsedMetrics: []metricsgen.ParsedMetricField{ + { + TypeName: "Counter", + FieldName: "myCounter", + MetricName: "my_counter", + }, + }, + }, + }, + } + for _, testCase := range metricsTests { + t.Run(testCase.name, func(t *testing.T) { + dir, err := os.MkdirTemp(os.TempDir(), "metricsdir") + if err != nil { + t.Fatalf("unable to create directory: %v", err) + } + defer os.Remove(dir) + f, err := os.Create(filepath.Join(dir, "metrics.go")) + if err != nil { + t.Fatalf("unable to open file: %v", err) + } + pkgLine := fmt.Sprintf("package %s\n", pkgName) + importClause := ` + import( + "github.com/go-kit/kit/metrics" + ) + ` + + _, err = io.WriteString(f, pkgLine) + require.NoError(t, err) + _, err = io.WriteString(f, importClause) + require.NoError(t, err) + _, err = io.WriteString(f, testCase.metricsStruct) + require.NoError(t, err) + + td, err := metricsgen.ParseMetricsDir(dir, "Metrics") + if testCase.shouldError { + require.Error(t, err) + } else { + require.NoError(t, err) + require.Equal(t, testCase.expected, td) + } + }) + } +} + +func TestParseAliasedMetric(t *testing.T) { + aliasedData := ` + package mypkg + + import( + mymetrics "github.com/go-kit/kit/metrics" + ) + type Metrics struct { + m mymetrics.Gauge + } + ` + dir, err := os.MkdirTemp(os.TempDir(), "metricsdir") + if err != nil { + t.Fatalf("unable to create directory: %v", err) + } + defer os.Remove(dir) + f, err := os.Create(filepath.Join(dir, "metrics.go")) + if err != nil { + t.Fatalf("unable to open file: %v", err) + } + _, err = io.WriteString(f, aliasedData) + if err != nil { + t.Fatalf("unable to write to file: %v", err) + } + td, err := metricsgen.ParseMetricsDir(dir, "Metrics") + require.NoError(t, err) + + expected := + metricsgen.TemplateData{ + Package: "mypkg", + ParsedMetrics: []metricsgen.ParsedMetricField{ + { + TypeName: "Gauge", + FieldName: "m", + MetricName: "m", + }, + }, + } + require.Equal(t, expected, td) +} diff --git a/scripts/metricsgen/testdata/basic/metrics.gen.go b/scripts/metricsgen/testdata/basic/metrics.gen.go new file mode 100644 index 000000000..d541cb2db --- /dev/null +++ b/scripts/metricsgen/testdata/basic/metrics.gen.go @@ -0,0 +1,30 @@ +// Code generated by metricsgen. DO NOT EDIT. + +package basic + +import ( + "github.com/go-kit/kit/metrics/discard" + prometheus "github.com/go-kit/kit/metrics/prometheus" + stdprometheus "github.com/prometheus/client_golang/prometheus" +) + +func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { + labels := []string{} + for i := 0; i < len(labelsAndValues); i += 2 { + labels = append(labels, labelsAndValues[i]) + } + return &Metrics{ + Height: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "height", + Help: "simple metric that tracks the height of the chain.", + }, labels).With(labelsAndValues...), + } +} + +func NopMetrics() *Metrics { + return &Metrics{ + Height: discard.NewGauge(), + } +} diff --git a/scripts/metricsgen/testdata/basic/metrics.go b/scripts/metricsgen/testdata/basic/metrics.go new file mode 100644 index 000000000..1a361f90f --- /dev/null +++ b/scripts/metricsgen/testdata/basic/metrics.go @@ -0,0 +1,11 @@ +package basic + +import "github.com/go-kit/kit/metrics" + +//go:generate go run ../../../../scripts/metricsgen -struct=Metrics + +// Metrics contains metrics exposed by this package. +type Metrics struct { + // simple metric that tracks the height of the chain. + Height metrics.Gauge +} diff --git a/scripts/metricsgen/testdata/commented/metrics.gen.go b/scripts/metricsgen/testdata/commented/metrics.gen.go new file mode 100644 index 000000000..038da3d46 --- /dev/null +++ b/scripts/metricsgen/testdata/commented/metrics.gen.go @@ -0,0 +1,30 @@ +// Code generated by metricsgen. DO NOT EDIT. + +package commented + +import ( + "github.com/go-kit/kit/metrics/discard" + prometheus "github.com/go-kit/kit/metrics/prometheus" + stdprometheus "github.com/prometheus/client_golang/prometheus" +) + +func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { + labels := []string{} + for i := 0; i < len(labelsAndValues); i += 2 { + labels = append(labels, labelsAndValues[i]) + } + return &Metrics{ + Field: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "field", + Help: "Height of the chain.We expect multi-line comments to parse correctly.", + }, labels).With(labelsAndValues...), + } +} + +func NopMetrics() *Metrics { + return &Metrics{ + Field: discard.NewGauge(), + } +} diff --git a/scripts/metricsgen/testdata/commented/metrics.go b/scripts/metricsgen/testdata/commented/metrics.go new file mode 100644 index 000000000..174f1e233 --- /dev/null +++ b/scripts/metricsgen/testdata/commented/metrics.go @@ -0,0 +1,11 @@ +package commented + +import "github.com/go-kit/kit/metrics" + +//go:generate go run ../../../../scripts/metricsgen -struct=Metrics + +type Metrics struct { + // Height of the chain. + // We expect multi-line comments to parse correctly. + Field metrics.Gauge +} diff --git a/scripts/metricsgen/testdata/tags/metrics.gen.go b/scripts/metricsgen/testdata/tags/metrics.gen.go new file mode 100644 index 000000000..7ac292d3c --- /dev/null +++ b/scripts/metricsgen/testdata/tags/metrics.gen.go @@ -0,0 +1,54 @@ +// Code generated by metricsgen. DO NOT EDIT. + +package tags + +import ( + "github.com/go-kit/kit/metrics/discard" + prometheus "github.com/go-kit/kit/metrics/prometheus" + stdprometheus "github.com/prometheus/client_golang/prometheus" +) + +func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { + labels := []string{} + for i := 0; i < len(labelsAndValues); i += 2 { + labels = append(labels, labelsAndValues[i]) + } + return &Metrics{ + WithLabels: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "with_labels", + Help: "", + }, append(labels, "step,time")).With(labelsAndValues...), WithExpBuckets: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "with_exp_buckets", + Help: "", + + Buckets: stdprometheus.ExponentialBuckets(.1, 100, 8), + }, labels).With(labelsAndValues...), + WithBuckets: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "with_buckets", + Help: "", + + Buckets: []float64{1, 2, 3, 4, 5}, + }, labels).With(labelsAndValues...), + Named: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "metric_with_name", + Help: "", + }, labels).With(labelsAndValues...), + } +} + +func NopMetrics() *Metrics { + return &Metrics{ + WithLabels: discard.NewCounter(), + WithExpBuckets: discard.NewHistogram(), + WithBuckets: discard.NewHistogram(), + Named: discard.NewCounter(), + } +} diff --git a/scripts/metricsgen/testdata/tags/metrics.go b/scripts/metricsgen/testdata/tags/metrics.go new file mode 100644 index 000000000..8562dcf43 --- /dev/null +++ b/scripts/metricsgen/testdata/tags/metrics.go @@ -0,0 +1,12 @@ +package tags + +import "github.com/go-kit/kit/metrics" + +//go:generate go run ../../../../scripts/metricsgen -struct=Metrics + +type Metrics struct { + WithLabels metrics.Counter `metrics_labels:"step,time"` + WithExpBuckets metrics.Histogram `metrics_buckettype:"exp" metrics_bucketsizes:".1,100,8"` + WithBuckets metrics.Histogram `metrics_bucketsizes:"1, 2, 3, 4, 5"` + Named metrics.Counter `metrics_name:"metric_with_name"` +}