From cec0a97987860bd753ae1b2d92963cb96afe6d7c Mon Sep 17 00:00:00 2001 From: Sergio Mena Date: Mon, 9 May 2022 02:15:08 +0200 Subject: [PATCH 1/5] RFC017: ABCI++ Vote Extension Propagation (#8317) * 1st version * Addressed (some of) @williambanfield's comments * Update docs/rfc/rfc-017-abci++-vote-extension-propag.md Co-authored-by: Daniel * Update docs/rfc/rfc-017-abci++-vote-extension-propag.md Co-authored-by: Daniel * Update docs/rfc/rfc-017-abci++-vote-extension-propag.md Co-authored-by: Daniel * Update docs/rfc/rfc-017-abci++-vote-extension-propag.md Co-authored-by: Sam Kleinman * Update docs/rfc/README.md Co-authored-by: Sam Kleinman * Addressed some comments * Addressed more comments. Improved description of Solution 3 * Work on 'definitions' section * Update docs/rfc/rfc-017-abci++-vote-extension-propag.md Co-authored-by: Callum Waters * bottom * Addressed Josef's valset-change comment. Other minor edits * Improved wording of 'disjoint valsets' case * Addressed TODOs: major revamp of various sections. First complete version. * Fixed minor wording problem * removed blank line * Update docs/rfc/rfc-017-abci++-vote-extension-propag.md Co-authored-by: Thane Thomson * Update docs/rfc/rfc-017-abci++-vote-extension-propag.md Co-authored-by: Thane Thomson * Addressed some of Thane's comments * Update docs/rfc/rfc-017-abci++-vote-extension-propag.md Co-authored-by: Thane Thomson * Update docs/rfc/rfc-017-abci++-vote-extension-propag.md Co-authored-by: Thane Thomson * Addressed outstanding comments * Addressed @williambanfield's 'catch-up message' comment * Removed TODO after confirming statesync is only run on nodes starting from scratch * Removed TODO (after checking with Jasmina) * Removed addressed TODO * Addressed Josef's feedback on case (h) * Typo * Update docs/rfc/rfc-017-abci++-vote-extension-propag.md Co-authored-by: Josef Widder <44643235+josef-widder@users.noreply.github.com> * Added log line Co-authored-by: Daniel Co-authored-by: Sam Kleinman Co-authored-by: Callum Waters Co-authored-by: Thane Thomson Co-authored-by: Josef Widder <44643235+josef-widder@users.noreply.github.com> --- docs/rfc/README.md | 3 + .../rfc-017-abci++-vote-extension-propag.md | 571 ++++++++++++++++++ 2 files changed, 574 insertions(+) create mode 100644 docs/rfc/rfc-017-abci++-vote-extension-propag.md diff --git a/docs/rfc/README.md b/docs/rfc/README.md index 6b03cf21a..a83d802b8 100644 --- a/docs/rfc/README.md +++ b/docs/rfc/README.md @@ -56,4 +56,7 @@ sections. - [RFC-018: BLS Signature Aggregation Exploration](./rfc-018-bls-agg-exploration.md) - [RFC-019: Configuration File Versioning](./rfc-019-config-version.md) + +- [RFC-017: ABCI++ Vote Extension Propagation](./rfc-017-abci++-vote-extension-propag.md) + diff --git a/docs/rfc/rfc-017-abci++-vote-extension-propag.md b/docs/rfc/rfc-017-abci++-vote-extension-propag.md new file mode 100644 index 000000000..15d08f7ba --- /dev/null +++ b/docs/rfc/rfc-017-abci++-vote-extension-propag.md @@ -0,0 +1,571 @@ +# RFC 017: ABCI++ Vote Extension Propagation + +## Changelog + +- 11-Apr-2022: Initial draft (@sergio-mena). +- 15-Apr-2022: Addressed initial comments. First complete version (@sergio-mena). +- 09-May-2022: Addressed all outstanding comments. + +## Abstract + +According to the +[ABCI++ specification](https://github.com/tendermint/tendermint/blob/4743a7ad0/spec/abci%2B%2B/README.md) +(as of 11-Apr-2022), a validator MUST provide a signed vote extension for each non-`nil` precommit vote +of height *h* that it uses to propose a block in height *h+1*. 
When a validator is up to date, this is easy to do, but when a validator needs to catch up this is far from trivial, as this data cannot be retrieved from the blockchain.

This RFC presents and compares the different options to address this problem, which have been proposed in several discussions by the Tendermint Core team.

## Document Structure

The RFC is structured as follows. In the [Background](#background) section, subsections [Problem Description](#problem-description) and [Cases to Address](#cases-to-address) explain the problem at hand from a high-level perspective, i.e., abstracting away from the current Tendermint implementation. In contrast, subsection [Current Catch-up Mechanisms](#current-catch-up-mechanisms) delves into the details of the current Tendermint code.

In the [Discussion](#discussion) section, subsection [Solutions Proposed](#solutions-proposed) is also worded abstracting away from implementation details, whilst subsections [Feasibility of the Proposed Solutions](#feasibility-of-the-proposed-solutions) and [Current Limitations and Possible Implementations](#current-limitations-and-possible-implementations) analyze the viability of one of the proposed solutions in the context of Tendermint's reactor-based architecture. Finally, [Formalization Work](#formalization-work) briefly discusses the work still needed to demonstrate the correctness of the chosen solution.

The high-level subsections are aimed at readers who are familiar with consensus algorithms, in particular with the one described in the Tendermint white paper, but who are not necessarily acquainted with the details of the Tendermint codebase. The other subsections, which go into implementation details, are best understood by engineers with deep knowledge of the implementation of Tendermint's blocksync and consensus reactors.

## Background

### Basic Definitions

This document assumes that all validators have equal voting power for the sake of simplicity. This is done without loss of generality.

There are two types of votes in Tendermint: *prevotes* and *precommits*. Votes can be `nil` or refer to a proposed block. This RFC focuses on precommits, also known as *precommit votes*. In this document we sometimes call them simply *votes*.

Validators send precommit votes to their peer nodes in *precommit messages*. According to the [ABCI++ specification](https://github.com/tendermint/tendermint/blob/4743a7ad0/spec/abci%2B%2B/README.md), a precommit message MUST also contain a *vote extension*. This mandatory vote extension can be empty, but MUST be signed with the same key as the precommit vote (i.e., the sending validator's). Nevertheless, the vote extension is signed independently from the vote, so a vote can be separated from its extension. The reason for vote extensions to be mandatory in precommit messages is that, otherwise, a (malicious) node could omit the vote extension while still providing/forwarding/sending the corresponding precommit vote.

The validator set at height *h* is denoted *valset<sub>h</sub>*. A *commit* for height *h* consists of more than *2n<sub>h</sub>/3* precommit votes voting for a block *b*, where *n<sub>h</sub>* denotes the size of *valset<sub>h</sub>*. A commit does not contain `nil` precommit votes, and all votes in it refer to the same block. An *extended commit* is a *commit* where every precommit vote has its respective vote extension attached.
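To make these definitions concrete, the following sketch shows the shape of the structures involved. All type and field names are illustrative only, gathered in a hypothetical `sketch` package that later examples in this RFC reuse; the actual Tendermint types differ.

```go
// Package sketch holds the simplified, illustrative types used in the code
// sketches of this RFC; the actual Tendermint types differ.
package sketch

// CommitSig is one validator's precommit vote for the decided block.
type CommitSig struct {
	ValidatorAddress string
	Signature        []byte // signature over the precommit vote
}

// Commit is what gets canonicalized in the block at the next height:
// more than 2n/3 precommit votes for the same block.
type Commit struct {
	Height     int64
	Round      int32
	BlockID    []byte
	Signatures []CommitSig
}

// ExtendedCommitSig also carries the vote extension and its signature.
// The extension is signed separately from the vote, but with the same
// validator key.
type ExtendedCommitSig struct {
	CommitSig
	Extension          []byte // may be empty, but must be signed
	ExtensionSignature []byte
}

// ExtendedCommit is a commit in which every precommit vote has its vote
// extension attached. Unlike a commit, it is not part of the blockchain.
type ExtendedCommit struct {
	Height     int64
	Round      int32
	BlockID    []byte
	Signatures []ExtendedCommitSig
}
```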
### Problem Description

In the version of [ABCI](https://github.com/tendermint/spec/blob/4fb99af/spec/abci/README.md) present up to Tendermint v0.35, for any height *h*, a validator *v* MUST have the decided block *b* and a commit for height *h* in order to decide at height *h*. Then, *v* just needs a commit for height *h* to propose at height *h+1*, in the rounds of *h+1* where *v* is a proposer.

In [ABCI++](https://github.com/tendermint/tendermint/blob/4743a7ad0/spec/abci%2B%2B/README.md), the information that a validator *v* MUST have to be able to decide in *h* does not change with respect to pre-existing ABCI: the decided block *b* and a commit for *h*. In contrast, for proposing in *h+1*, a commit for *h* is not enough: *v* MUST now have an extended commit.

When a validator takes an active part in consensus at height *h*, it has all the data it needs in memory, in its consensus state, to decide on *h* and propose in *h+1*. Things are not so easy in the cases when *v* cannot take part in consensus because it is late (e.g., it falls behind, it crashes and recovers, or it just starts after the others). If *v* does not take part, it cannot actively gather precommit messages (which include vote extensions) in order to decide. Before ABCI++, this was not a problem: full nodes are supposed to persist past blocks in the block store, so other nodes would realise that *v* is late and send it the missing decided block at height *h* and the corresponding commit (kept in block *h+1*) so that *v* can catch up. However, we cannot apply this catch-up technique for ABCI++, as the vote extensions, which are part of the needed *extended commit*, are not part of the blockchain.

### Cases to Address

Before we tackle the description of the possible cases we need to address, let us describe the following incremental improvement to the ABCI++ logic. Upon decision, a full node persists (e.g., in the block store) the extended commit that allowed the node to decide. For the moment, let us assume the node only needs to keep its *most recent* extended commit, and MAY remove any older extended commits from persistent storage. This improvement is so obvious that all solutions described in the [Discussion](#discussion) section use it as a building block. Moreover, it completely addresses by itself some of the cases described in this subsection.

We now describe the cases (i.e., possible *runs* of the system) that have been raised in different discussions and need to be addressed. They are (roughly) ordered from easiest to hardest to deal with.

- **(a)** *Happy path: all validators advance together, no crash*.

  This case is included for completeness. All validators have taken part in height *h*. Even if some of them did not manage to send a precommit message for the decided block, they all receive enough precommit messages to be able to decide. As vote extensions are mandatory in precommit messages, every validator *v* trivially has all the information, namely the decided block and the extended commit, needed to propose in height *h+1* for the rounds in which *v* is the proposer.

  No problem to solve here.

- **(b)** *All validators advance together, then all crash at the same height*.

  This case has been raised in some discussions, the main concern being whether the vote extensions for the previous height would be lost across the network.
  With the improvement described above, namely persisting the latest extended commit at decision time, this case is solved. When a crashed validator recovers, it recovers the last extended commit from persistent storage and handshakes with the Application. If need be, it also reconstructs messages for the unfinished height (including all precommits received) from the WAL. Then, the validator can resume where it was at the time of the crash. Thus, as extensions are persisted, either in the WAL (in the form of received precommit messages), or in the latest extended commit, the only way the vote extensions needed to start the next height could be lost forever would be if all validators crashed and never recovered (e.g., disk corruption). Since a *correct* node MUST eventually recover, this violates Tendermint's assumption of more than *2n<sub>h</sub>/3* correct validators for every height *h*.

  No problem to solve here.

- **(c)** *Lagging majority*.

  Let us assume the validator set does not change between *h* and *h+1*. A lagging majority is not possible by the nature of the Tendermint algorithm, which requires more than *2n<sub>h</sub>/3* precommit votes for some round of height *h* in order to make progress. So, only up to *n<sub>h</sub>/3* validators can lag behind.

  On the other hand, for the case where there are changes to the validator set between *h* and *h+1*, please see case (d) below, where the extreme case is discussed.

- **(d)** *Validator set changes completely between* h *and* h+1.

  If sets *valset<sub>h</sub>* and *valset<sub>h+1</sub>* are disjoint, more than *2n<sub>h</sub>/3* of the validators in height *h* must have actively participated in consensus in *h*. So, as of height *h*, only a minority of validators in *h* can be lagging behind, although they could all lag behind from *h+1* on, as they are no longer validators, only full nodes. This situation falls under the assumptions of case (h) below.

  As for validators in *valset<sub>h+1</sub>*, as they were not validators as of height *h*, they could all be lagging behind by that time. However, by the time *h* finishes and *h+1* begins, the chain will halt until more than *2n<sub>h+1</sub>/3* of them have caught up and started consensus at height *h+1*. If set *valset<sub>h+1</sub>* does not change in *h+2* and subsequent heights, only up to *n<sub>h+1</sub>/3* validators will be able to lag behind. Thus, we have converted this case into case (h) below.

- **(e)** *Enough validators crash to block the rest*.

  In this case, blockchain progress halts, i.e., surviving full nodes keep increasing rounds indefinitely, until some of the crashed validators are able to recover. Those validators that recover first will handshake with the Application and resume at the height at which they crashed, which is still the height at which the nodes that did not crash are stuck, so they do not need to catch up. Further, they had persisted the extended commit for the previous height. Nothing to solve.

  For those validators recovering later, we are in case (h) below.

- **(f)** *Some validators crash, but not enough to block progress*.

  When the correct processes that crashed recover, they handshake with the Application and resume at the height they were at when they crashed. As the blockchain did not stop making progress, the recovered processes are likely to have fallen behind with respect to the progressing majority.

  At this point, the recovered processes are in case (h) below.

- **(g)** *A new full node starts*.
  The reasoning here also applies to the case when more than one full node is starting. When a full node starts from scratch, it has no state (its current height is 0). Ignoring statesync for the time being, the node just needs to catch up by applying past blocks one by one (after verifying them).

  Thus, the node is in case (h) below.

- **(h)** *Advancing majority, lagging minority*.

  In this case, some nodes are late. More precisely, at the present time, a set of full nodes, denoted *L<sub>hp</sub>*, are falling behind (e.g., due to temporary disconnection or network partition, memory thrashing, crashes, or being new nodes) an arbitrary number of heights: between *h<sub>s</sub>* and *h<sub>p</sub>*, where *h<sub>s</sub> < h<sub>p</sub>*, and *h<sub>p</sub>* is the highest height any correct full node has reached so far.

  The correct full nodes that reached *h<sub>p</sub>* were able to decide for *h<sub>p</sub>-1*. Therefore, fewer than *n<sub>hp-1</sub>/3* validators of *h<sub>p</sub>-1* can be part of *L<sub>hp</sub>*, since enough up-to-date validators needed to actively participate in consensus for *h<sub>p</sub>-1*.

  Since, at the present time, no node in *L<sub>hp</sub>* took part in any consensus between *h<sub>s</sub>* and *h<sub>p</sub>-1*, the reasoning above can be extended to validator set changes between *h<sub>s</sub>* and *h<sub>p</sub>-1*. This results in the following restriction on the full nodes that can be part of *L<sub>hp</sub>*.

  - ∀ *h*, where *h<sub>s</sub> ≤ h < h<sub>p</sub>*: | *valset<sub>h</sub>* ∩ *L<sub>hp</sub>* | *< n<sub>h</sub>/3*

  If this property did not hold for a particular height *h*, where *h<sub>s</sub> ≤ h < h<sub>p</sub>*, Tendermint could not have progressed beyond *h*, and therefore no full node could have reached *h<sub>p</sub>* (a contradiction).

  These lagging nodes in *L<sub>hp</sub>* need to catch up. They have to obtain the information needed to make progress from other nodes. For each height *h* between *h<sub>s</sub>* and *h<sub>p</sub>-2*, this includes the decided block for *h*, and the precommit votes for deciding *h* (which can be extracted from the block at height *h+1*).

  At a given height *h<sub>c</sub>* (where possibly *h<sub>c</sub> << h<sub>p</sub>*), a full node in *L<sub>hp</sub>* will consider itself *caught up*, based on the (maybe out-of-date) information it is getting from its peers. Then, the node needs to be ready to propose at height *h<sub>c</sub>+1*, which requires having received the vote extensions for *h<sub>c</sub>*. As the vote extensions are *not* stored in the blocks, and it is difficult to have strong guarantees on *when* a late node considers itself caught up, providing the late node with the right vote extensions for the right height poses a problem.

At this point, we have described and compared all cases raised in discussions leading up to this RFC. The list above aims at being exhaustive. The analysis of each case included above makes all of them converge into case (h).

### Current Catch-up Mechanisms

We now briefly describe the current catch-up mechanisms in the Tendermint reactors concerned.

#### Statesync

Full nodes optionally run statesync just after starting, when they start from scratch. If statesync succeeds, an Application snapshot is installed, and Tendermint jumps from height 0 directly to the height the Application snapshot represents, without applying the blocks of any previous height. Some light blocks are received and stored in the block store for running light-client verification of all the skipped blocks. Light blocks are incomplete blocks, typically containing the header and the canonical commit but, e.g., no transactions. They are stored in the block store as "signed headers".
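Continuing the illustrative `sketch` package from the [Basic Definitions](#basic-definitions) example, a stored "signed header" can be pictured as follows. Note that it carries no transactions and, importantly for this RFC, no vote extensions.

```go
package sketch

// Header is a block header; light blocks carry no transactions.
type Header struct {
	Height  int64
	AppHash []byte
	// ... remaining header fields elided
}

// SignedHeader is how light blocks are persisted in the block store:
// a header plus the canonical commit that signs it (the Commit type
// from the earlier sketch), but no transactions and no vote extensions.
type SignedHeader struct {
	Header Header
	Commit Commit // canonical commit for Header.Height
}
```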
The statesync reactor is not really relevant for solving the problem discussed in this RFC. We will nevertheless mention it when needed; in particular, to understand some corner cases.

#### Blocksync

The blocksync reactor kicks in after startup or recovery (and, optionally, after statesync is done) and sends the following messages to its peers:

- `StatusRequest`, to query the height its peers are currently at, and
- `BlockRequest`, asking for blocks of heights the local node is missing.

Using the `BlockResponse` messages received from peers, the blocksync reactor validates each received block using the block of the following height, saves the block in the block store, and sends the block to the Application for execution.

If blocksync has validated and applied the block for the height *previous* to the highest seen in a `StatusResponse` message, or if no progress has been made after a timeout, the node considers itself caught up and switches to the consensus reactor.

#### Consensus Reactor

The consensus reactor runs the full Tendermint algorithm. For a validator this means it has to propose blocks, and to send and receive prevote/precommit messages, as mandated by Tendermint, before it can decide and move on to the next height.

If a full node that is running the consensus reactor falls behind at height *h*, a peer node that realises this will retrieve the canonical commit of *h+1* from the block store, *convert* it into a set of precommit votes, and send those to the late node.

## Discussion

### Solutions Proposed

These are the solutions proposed in discussions leading up to this RFC.

- **Solution 0.** *Vote extensions are made **best effort** in the specification*.

  This is the simplest solution, considered as a way to provide vote extensions in a simple enough way for them to be part of v0.36. It consists in changing the specification so as to not *require* that precommit votes used upon `PrepareProposal` contain their corresponding vote extensions. In other words, we render vote extensions optional. There are strong implications stemming from such a relaxation of the original specification.

  - As a vote extension is signed *separately* from the vote it is extending, an intermediate node can now remove (i.e., censor) vote extensions from precommit messages at will.
  - Further, there is no point anymore in the spec requiring the Application to accept a vote extension passed via `VerifyVoteExtension` in order to consider a precommit message valid in its entirety. Remember that this behavior of `VerifyVoteExtension` adds a constraint to Tendermint's conditions for liveness. In this situation, it is better and simpler to just drop a vote extension rejected by the Application via `VerifyVoteExtension`, but still consider the precommit vote itself valid as long as its signature verifies.

- **Solution 1.** *Include vote extensions in the blockchain*.

  Another obvious solution, which has been considered in past discussions, is to include the vote extensions and their signatures in the blockchain. The blockchain would thus include the extended commit, rather than a regular commit, as the structure to be canonicalized in the next block.
  With this solution, the current mechanisms implemented in both the blocksync and consensus reactors would still be correct, as all the information a node needs to catch up, and to start proposing when it considers itself caught up, can now be recovered from past blocks saved in the block store.

  This solution has two main drawbacks.

  - As the block format must change, upgrading a chain requires a hard fork. Furthermore, all existing light client implementations will stop working until they are upgraded to deal with the new format (e.g., how certain hashes are calculated and/or how certain signatures are checked). For instance, let us consider IBC, which relies on light clients. An IBC connection between two chains will be broken if only one chain upgrades.
  - The extra information (i.e., the vote extensions) that is now kept in the blockchain is not really needed *at every height* for a late node to catch up.
    - This information is only needed to be able to *propose* at the height at which the validator considers itself caught up. If a validator is indeed late for height *h*, it is useless (although correct) for it to call `PrepareProposal`, or `ExtendVote`, since the block is already decided.
    - Moreover, some use cases require pretty sizeable vote extensions, which would result in a significant waste of space in the blockchain.

- **Solution 2.** *Skip the* propose *step in the Tendermint algorithm*.

  This solution consists in modifying the Tendermint algorithm to skip the *send proposal* step in heights where the node does not have the required vote extensions to populate the call to `PrepareProposal`. The main idea behind this is that this situation should only arise when the validator is late and, therefore, up-to-date validators have already proposed (and decided) for that height. A small variation of this solution is, rather than skipping the *send proposal* step, to have the validator send a special *empty* or *bottom* (⊥) proposal to signal to other nodes that it is not ready to propose at (any round of) the current height.

  The appeal of this solution is its simplicity. A possible implementation does not need to extend the data structures, or change the current catch-up mechanisms implemented in the blocksync or consensus reactors. When we lack the needed information (vote extensions), we simply rely on another correct validator to propose a valid block in other rounds of the current height.

  However, this solution can be attacked by a byzantine node in the network in the following way.
  Let us consider the following scenario:

  - all validators in *valset<sub>h</sub>* send out precommit messages, with vote extensions, for height *h*, round 0, roughly at the same time,
  - all those precommit messages contain non-`nil` precommit votes, which vote for block *b*,
  - all those precommit messages sent in height *h*, round 0, and all messages sent in height *h*, round *r > 0*, get delayed indefinitely, so
  - all validators in *valset<sub>h</sub>* keep waiting for enough precommit messages for height *h*, round 0, needed for deciding in height *h*,
  - an intermediate (malicious) full node *m* manages to receive block *b* and to gather more than *2n<sub>h</sub>/3* precommit messages for height *h*, round 0,
  - one way or another, the solution should have either (a) a mechanism for a full node to *tell* another full node it is late, or (b) a mechanism for a full node to conclude it is late based on other full nodes' messages; any of these mechanisms should, at the very least, require the late node to receive the decided block and a commit (not necessarily an extended commit) for *h*,
  - node *m* uses the gathered precommit messages to build a commit for height *h*, round 0,
  - in order to convince full nodes that they are late, node *m* either (a) *tells* them they are late, or (b) shows them that it (i.e., *m*) is ahead, by sending them block *b*, along with the commit for height *h*, round 0,
  - all full nodes conclude they are late from *m*'s behavior, and use block *b* and the commit for height *h*, round 0, to decide on height *h* and proceed to height *h+1*.

  At this point, *all* full nodes, including all validators in *valset<sub>h+1</sub>*, have advanced to height *h+1* believing they are late, and are thus expecting the *hypothetical* leading majority of validators in *valset<sub>h+1</sub>* to propose for *h+1*. As a result, the blockchain grinds to a halt. A (rather complex) ad-hoc procedure would need to be carried out by node operators to roll back all validators to the precommit step of height *h*, round *r*, so that they can regenerate vote extensions (remember vote extensions are non-deterministic) and continue execution.

- **Solution 3.** *Require extended commits to be available at switching time*.

  This one is more involved than all previous solutions, and builds on an idea present in Solution 2: vote extensions are actually not needed for Tendermint to make progress as long as the validator is *certain* it is late.

  We define two modes. The first is denoted *catch-up mode*, in which Tendermint only calls `FinalizeBlock` for each height. The second is denoted *consensus mode*, in which the validator considers itself up to date and fully participates in consensus, calling `PrepareProposal`/`ProcessProposal`, `ExtendVote`, and `VerifyVoteExtension` before calling `FinalizeBlock`.

  The catch-up mode does not need vote extension information to make progress, as all it needs is the decided block at each height to call `FinalizeBlock` and keep state-machine replication making progress. The consensus mode, on the other hand, does need vote extension information when starting every height.

  Validators are in consensus mode by default. When a validator in consensus mode falls behind for whatever reason, e.g.,
cases (b), (d), (e), (f), (g), or (h) above, we introduce the following key safety property:

  - for every height *h<sub>p</sub>*, a full node *f* in *h<sub>p</sub>* refuses to switch to catch-up mode **until** there exists a height *h'* such that:
    - *f* has received and (light-client) verified the blocks of all heights *h*, where *h<sub>p</sub> ≤ h ≤ h'*, and
    - it has received an extended commit for *h'* and has verified:
      - the precommit vote signatures in the extended commit, and
      - the vote extension signatures in the extended commit: each is signed with the same key as the precommit vote it extends.

  If the condition above holds for *h<sub>p</sub>*, namely receiving a valid sequence of blocks in *f*'s future, and an extended commit corresponding to the last block in the sequence, then node *f*:

  - switches to catch-up mode,
  - applies all blocks between *h<sub>p</sub>* and *h'* (calling `FinalizeBlock` only), and
  - switches back to consensus mode, using the extended commit for *h'* to propose in the rounds of *h' + 1* where it is the proposer.

  This mechanism, together with the invariant it uses, ensures that the node cannot be attacked by being fed a block without extensions to make it believe it is late, in a way similar to the one explained for Solution 2.

### Feasibility of the Proposed Solutions

Solution 0, besides the drawbacks described in the previous section, provides guarantees that are weaker than the rest. The Application does not have the assurance that more than *2n<sub>h</sub>/3* vote extensions will *always* be available when calling `PrepareProposal` at height *h+1*. This level of guarantees is probably not strong enough for vote extensions to be useful for some of the important use cases that motivated them in the first place, e.g., encrypted mempool transactions.

Solution 1, while being simple in that the changes needed in the current Tendermint codebase would be rather small, changes the block format, and would therefore require all blockchains using Tendermint v0.35 or earlier to hard-fork when upgrading to v0.36.

Since Solution 2 can be attacked, one might prefer Solution 3, even if it is more involved to implement. Further, we must elaborate on how we can turn Solution 3, described in abstract terms in the previous section, into a concrete implementation compatible with the current Tendermint codebase.

### Current Limitations and Possible Implementations

The main limitations affecting the current version of Tendermint are the following.

- The current version of the blocksync reactor does not use the full [light client verification](https://github.com/tendermint/tendermint/blob/4743a7ad0/spec/light-client/README.md) algorithm to validate blocks coming from other peers.
- With the code structured into the blocksync and consensus reactors, only switching from the blocksync reactor to the consensus reactor is supported; switching in the opposite direction is not. Alternatively, the consensus reactor could have a mechanism allowing a late node to catch up by skipping calls to `PrepareProposal`/`ProcessProposal` and `ExtendVote`/`VerifyVoteExtension`, and only calling `FinalizeBlock` for each height. Such a mechanism does not exist at the time of writing this RFC.

The blocksync reactor featuring light client verification is being actively worked on (tentatively for v0.37). So it is best if this RFC does not try to delve into that problem, but just makes sure its outcomes are compatible with that effort.
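Whatever form the final implementation takes, the switching condition of Solution 3 implies a verification step along the lines sketched below. The validator-set shape and helper functions (`voteSignBytes`, `extensionSignBytes`) are hypothetical stand-ins, again extending the illustrative `sketch` package; this is not the actual Tendermint API.

```go
package sketch

import "errors"

// PubKey abstracts a validator's consensus key.
type PubKey interface {
	VerifySignature(msg, sig []byte) bool
}

// Validator and ValidatorSet, simplified.
type Validator struct {
	PubKey      PubKey
	VotingPower int64
}

type ValidatorSet struct {
	Vals map[string]Validator // keyed by validator address
}

func (vs ValidatorSet) TotalVotingPower() int64 {
	var total int64
	for _, v := range vs.Vals {
		total += v.VotingPower
	}
	return total
}

// voteSignBytes and extensionSignBytes stand in for the canonical
// sign-bytes encodings of a precommit vote and of a vote extension.
func voteSignBytes(ec ExtendedCommit, s ExtendedCommitSig) []byte      { return nil /* elided */ }
func extensionSignBytes(ec ExtendedCommit, s ExtendedCommitSig) []byte { return nil /* elided */ }

// VerifyExtendedCommit performs the checks Solution 3 requires before a
// node may use an extended commit to switch back to consensus mode.
func VerifyExtendedCommit(vs ValidatorSet, ec ExtendedCommit) error {
	var power int64
	for _, s := range ec.Signatures {
		val, ok := vs.Vals[s.ValidatorAddress]
		if !ok {
			return errors.New("signer not in validator set")
		}
		// Check the precommit vote signature.
		if !val.PubKey.VerifySignature(voteSignBytes(ec, s), s.Signature) {
			return errors.New("invalid precommit signature")
		}
		// Check the vote extension signature: it must verify against the
		// same validator key as the precommit vote it extends.
		if !val.PubKey.VerifySignature(extensionSignBytes(ec, s), s.ExtensionSignature) {
			return errors.New("invalid vote extension signature")
		}
		power += val.VotingPower
	}
	// More than 2/3 of the total voting power must have signed.
	if 3*power <= 2*vs.TotalVotingPower() {
		return errors.New("insufficient voting power in extended commit")
	}
	return nil
}
```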
In subsection [Cases to Address](#cases-to-address), we concluded that we can focus on solving case (h) in theoretical terms. However, as the current Tendermint version does not yet support switching back to blocksync once a node has switched to consensus, we need to split case (h) into two cases. When a full node needs to catch up...

- **(h.1)** ... it has not yet switched from the blocksync reactor to the consensus reactor, or

- **(h.2)** ... it has already switched to the consensus reactor.

This distinction is important in order to discuss the different possible implementations.

#### Base Implementation: Persist and Propagate Extended Commit History

In order to circumvent the fact that we cannot switch from the consensus reactor back to blocksync, rather than just keeping the few most recent extended commits, nodes will need to keep and gossip a backlog of extended commits so that the consensus reactor can still propose and decide in out-of-date heights (even if those proposals will be useless).

The base implementation, for which an experimental patch exists, consists in the conservative approach of persisting in the block store *all* extended commits for which we have also stored the full block. Currently, when statesync is run at startup, it saves light blocks. This base implementation does not seek to receive or persist extended commits for those light blocks, as they would not be of any use.

Then, we modify the blocksync reactor so that peers *always* send requested full blocks together with the corresponding extended commit in the `BlockResponse` messages. This guarantees that the block store being reconstructed by blocksync has the same information as that of peers that are up to date (at least starting from the latest snapshot applied by statesync before starting blocksync). Thus, blocksync has all the data it requires to switch to the consensus reactor, as long as one of the following exit conditions is met:

- the node is still at height 0 (where no commit or extended commit is needed), or
- the node has processed at least one block in blocksync.

The second condition is needed in case the node has installed an Application snapshot during statesync. If that is the case, at the time blocksync starts, the block store only has the data statesync has saved: light blocks, and no extended commits. Hence, we need to blocksync at least one block from another node, which will be sent with its corresponding extended commit, before we can switch to consensus.

As a side note, a chain might be started at a height *h<sub>i</sub> > 0*, all other heights *h < h<sub>i</sub>* being non-existent. In this case, the chain is still considered to be at height 0 before block *h<sub>i</sub>* is applied, so the first condition above allows the node to switch to consensus even if blocksync has not processed any block (which is always the case if all nodes are starting from scratch).

When a validator falls behind while having already switched to the consensus reactor, a peer node can simply retrieve the extended commit for the required height from the block store, reconstruct the set of precommit votes together with their extensions, and send them in the form of precommit messages to the validator falling behind. This works regardless of whether the peer node holds the extended commit because it actually participated in that consensus (and thus received the precommit messages), or because it received the extended commit in a `BlockResponse` message while running blocksync. A sketch of this reconstruction follows.
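This sketch again uses the illustrative `sketch` package types rather than the actual Tendermint ones.

```go
package sketch

// Vote is a simplified precommit message as sent on the consensus channel.
type Vote struct {
	Height             int64
	Round              int32
	BlockID            []byte
	ValidatorAddress   string
	ValidatorIndex     int32
	Signature          []byte
	Extension          []byte
	ExtensionSignature []byte
}

// PrecommitsFromExtendedCommit converts a stored extended commit back into
// individual precommit messages, each carrying its vote extension and the
// extension's signature, so they can be sent to a lagging validator.
func PrecommitsFromExtendedCommit(ec ExtendedCommit) []Vote {
	votes := make([]Vote, 0, len(ec.Signatures))
	for i, s := range ec.Signatures {
		votes = append(votes, Vote{
			Height:             ec.Height,
			Round:              ec.Round,
			BlockID:            ec.BlockID,
			ValidatorAddress:   s.ValidatorAddress,
			ValidatorIndex:     int32(i),
			Signature:          s.Signature,
			Extension:          s.Extension,
			ExtensionSignature: s.ExtensionSignature,
		})
	}
	return votes
}
```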
This solution requires a few changes to the consensus reactor:

- upon saving the block for a given height in the block store at decision time, save the corresponding extended commit as well,
- in the catch-up mechanism, when a node realizes that another peer is more than two heights behind, use the extended commit (rather than the canonical commit as done previously) to reconstruct the precommit votes with their corresponding extensions.

The changes to the blocksync reactor are more substantial:

- the `BlockResponse` message is extended to include the extended commit of the same height as the block included in the response (just as they are stored in the block store),
- structure `bpRequester` is likewise extended to hold the extended commits coming in `BlockResponse` messages,
- method `PeekTwoBlocks` is modified to also return the extended commit corresponding to the first block,
- when successfully verifying a received block, the reactor saves its corresponding extended commit in the block store.

The two main drawbacks of this base implementation are:

- the increased size taken by the block store, in particular with big extensions, and
- the increased bandwidth taken by the new format of `BlockResponse`.

#### Possible Optimization: Pruning the Extended Commit History

If we cannot switch from the consensus reactor back to the blocksync reactor, we cannot prune the extended commit backlog in the block store without sacrificing the implementation's correctness. The asynchronous nature of our distributed system model allows a process to fall behind an arbitrary number of heights, and thus all extended commits need to be kept *just in case* a node that far behind had previously switched to the consensus reactor.

However, there is a possibility to optimize the base implementation. Every time we enter a new height, we could prune from the block store all extended commits that are more than *d* heights in the past (a sketch of this step follows below). Then, we need to handle two new situations, roughly equivalent to cases (h.1) and (h.2) described above.

- (h.1) A node starts from scratch or recovers after a crash. In this case, we need to modify the blocksync reactor's base implementation:
  - when receiving a `BlockResponse` message, it MUST accept that the extended commit may be set to `nil`,
  - when sending a `BlockResponse` message, if the block store contains the extended commit for that height, it MUST set it in the message; otherwise, it sets it to `nil`,
  - the exit conditions used for the base implementation are no longer valid; the only reliable exit condition now consists in making sure that the last block processed by blocksync was received with the corresponding extended commit, and not `nil`; this extended commit will allow the node to switch from the blocksync reactor to the consensus reactor and immediately act as a proposer if required.
- (h.2) A node already running the consensus reactor falls behind beyond *d* heights. In principle, the node will be stuck forever, as no other node can provide the vote extensions it needs to make progress (they have all pruned the corresponding extended commit). However, we can manually have the node crash and recover as a workaround. This effectively converts this case into (h.1).
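As an illustration, here is a minimal sketch of that pruning step, assuming a hypothetical block-store interface (the actual Tendermint block store API differs):

```go
package sketch

// BlockStore abstracts the persistence operations the pruning step needs
// (hypothetical interface, for illustration only).
type BlockStore interface {
	BaseExtendedCommitHeight() int64 // lowest height with a stored extended commit
	DeleteExtendedCommit(height int64)
}

// PruneExtendedCommits removes, upon entering currentHeight, every stored
// extended commit that is more than d heights in the past.
func PruneExtendedCommits(bs BlockStore, currentHeight, d int64) {
	for h := bs.BaseExtendedCommitHeight(); h < currentHeight-d; h++ {
		bs.DeleteExtendedCommit(h)
	}
}
```

### Formalization Work

Formalization work to show or prove the correctness of the different use cases and solutions presented here (and any others that may be found) still needs to be carried out.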
A question that needs a precise answer is how many extended commits (one? two?) a node needs to keep in persistent storage when implementing Solution 3 (described above) without Tendermint's current limitations. Another important invariant we need to prove formally is that the set of vote extensions required to make progress will always be held somewhere in the network.

## References

- [ABCI++ specification](https://github.com/tendermint/tendermint/blob/4743a7ad0/spec/abci%2B%2B/README.md)
- [ABCI as of v0.35](https://github.com/tendermint/spec/blob/4fb99af/spec/abci/README.md)
- [Vote extensions issue](https://github.com/tendermint/tendermint/issues/8174)
- [Light client verification](https://github.com/tendermint/tendermint/blob/4743a7ad0/spec/light-client/README.md)

From 494c5cddbe6cc3364708e4e5e6559da1beefdec5 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 9 May 2022 14:31:59 +0000
Subject: [PATCH 2/5] build(deps): Bump docker/setup-buildx-action from 1.7.0
 to 2.0.0 (#8483)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps [docker/setup-buildx-action](https://github.com/docker/setup-buildx-action) from 1.7.0 to 2.0.0.
Release notes

Sourced from docker/setup-buildx-action's releases.

v2.0.0

  • Node 16 as default runtime by @​crazy-max (#131)
    • This requires a minimum Actions Runner version of v2.285.0, which is by default available in GHES 3.4 or later.

Full Changelog: https://github.com/docker/setup-buildx-action/compare/v1.7.0...v2.0.0

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=docker/setup-buildx-action&package-manager=github_actions&previous-version=1.7.0&new-version=2.0.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
--- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 524df1ef8..1c16133e0 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -39,7 +39,7 @@ jobs: platforms: all - name: Set up Docker Build - uses: docker/setup-buildx-action@v1.7.0 + uses: docker/setup-buildx-action@v2.0.0 - name: Login to DockerHub if: ${{ github.event_name != 'pull_request' }} From 083716b22a8d02b332950655634dd5bc9a7afd71 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 May 2022 14:33:45 +0000 Subject: [PATCH 3/5] build(deps): Bump docker/build-push-action from 2.10.0 to 3.0.0 (#8482) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 2.10.0 to 3.0.0.
Release notes

Sourced from docker/build-push-action's releases.

v3.0.0

  • Node 16 as default runtime by @​crazy-max (#564)
    • This requires a minimum Actions Runner version of v2.285.0, which is by default available in GHES 3.4 or later.
  • Standalone mode support by @​crazy-max (#601 #609)
  • chore: update dev dependencies and workflow by @​crazy-max (#571)
  • Bump @​actions/exec from 1.1.0 to 1.1.1 (#573)
  • Bump @​actions/github from 5.0.0 to 5.0.1 (#582)
  • Bump minimist from 1.2.5 to 1.2.6 (#584)
  • Bump semver from 7.3.5 to 7.3.7 (#595)
  • Bump csv-parse from 4.16.3 to 5.0.4 (#533)

Full Changelog: https://github.com/docker/build-push-action/compare/v2.10.0...v3.0.0

Commits
  • e551b19 Merge pull request #564 from crazy-max/node-16
  • 3554377 Merge pull request #609 from crazy-max/ci-fix-test
  • a62bc1b ci: fix standalone test
  • c208583 Merge pull request #601 from crazy-max/standalone-mode
  • fcd9124 Merge pull request #607 from docker/dependabot/github_actions/docker/metadata...
  • 0ebe720 Bump docker/metadata-action from 3 to 4
  • 38b4580 Standalone mode support
  • ba31738 Merge pull request #533 from docker/dependabot/npm_and_yarn/csv-parse-5.0.4
  • 43721d2 Update generated content
  • 5ea21bf Fix csv-parse implementation since major update
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=docker/build-push-action&package-manager=github_actions&previous-version=2.10.0&new-version=3.0.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
--- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 1c16133e0..7547e8591 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -49,7 +49,7 @@ jobs: password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Publish to Docker Hub - uses: docker/build-push-action@v2.10.0 + uses: docker/build-push-action@v3.0.0 with: context: . file: ./DOCKER/Dockerfile From b52b8f2740bf075e2a917784227a0c4ff4b9f470 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 May 2022 14:35:43 +0000 Subject: [PATCH 4/5] build(deps): Bump docker/login-action from 1.14.1 to 2.0.0 (#8481) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [docker/login-action](https://github.com/docker/login-action) from 1.14.1 to 2.0.0.
Release notes

Sourced from docker/login-action's releases.

v2.0.0

  • Node 16 as default runtime by @​crazy-max (#161)
    • This requires a minimum Actions Runner version of v2.285.0, which is by default available in GHES 3.4 or later.
  • chore: update dev dependencies and workflow by @​crazy-max (#170)
  • Bump @​actions/exec from 1.1.0 to 1.1.1 (#167)
  • Bump @​actions/io from 1.1.1 to 1.1.2 (#168)
  • Bump minimist from 1.2.5 to 1.2.6 (#176)
  • Bump https-proxy-agent from 5.0.0 to 5.0.1 (#182)

Full Changelog: https://github.com/docker/login-action/compare/v1.14.1...v2.0.0

Commits
  • 49ed152 Merge pull request #161 from crazy-max/node16-runtime
  • b61a9ce Node 16 as default runtime
  • 3a136a8 Merge pull request #182 from docker/dependabot/npm_and_yarn/https-proxy-agent...
  • b312880 Update generated content
  • 795794e Bump https-proxy-agent from 5.0.0 to 5.0.1
  • 1edf618 Merge pull request #179 from docker/dependabot/github_actions/codecov/codecov...
  • 8e66ad4 Bump codecov/codecov-action from 2 to 3
  • 7c79b59 Merge pull request #176 from docker/dependabot/npm_and_yarn/minimist-1.2.6
  • 24a38e0 Bump minimist from 1.2.5 to 1.2.6
  • 70e1ff8 Merge pull request #170 from crazy-max/eslint
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=docker/login-action&package-manager=github_actions&previous-version=1.14.1&new-version=2.0.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
---
 .github/workflows/docker.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 7547e8591..0a006f9b9 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -43,7 +43,7 @@ jobs:
       - name: Login to DockerHub
         if: ${{ github.event_name != 'pull_request' }}
-        uses: docker/login-action@v1.14.1
+        uses: docker/login-action@v2.0.0
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}

From 4b36feaa2b1e6f2794becfa6023f74007c2b4abd Mon Sep 17 00:00:00 2001
From: William Banfield <4561443+williambanfield@users.noreply.github.com>
Date: Mon, 9 May 2022 18:52:10 -0400
Subject: [PATCH 5/5] scripts/metricsgen: add the initial version of
 metricsgen (#8479)

This pull request adds a new tool, metricsgen, for generating Tendermint
metrics constructors from `Metrics` struct definitions. This tool aims to
reduce the amount of boilerplate required to add additional metrics to
Tendermint. Its operation is fairly simple: it parses the Go AST, extracts
field information, and uses this information to execute a Go template.

This pull request also adds a proof-of-concept of the tool's output and
operation by using it to generate the [indexer metrics](https://github.com/tendermint/tendermint/pull/8479/files#diff-4b0c597b6fa05332a2f9a8e0ce079e360602942fae99dc5485f1edfe71c0a29e)
using `//go:generate` directives and a simple `make` target. The next steps
for this tool are documented in
https://github.com/tendermint/tendermint/issues/8485 and
https://github.com/tendermint/tendermint/issues/8486, which detail using the
tool to generate the `metrics.md` documentation file and using the tool to
migrate away from `go-kit`.
---
 Makefile                                      |  15 +
 internal/state/indexer/metrics.gen.go         |  51 +++
 internal/state/indexer/metrics.go             |  52 +--
 scripts/metricsgen/metricsgen.go              | 334 ++++++++++++++++++
 scripts/metricsgen/metricsgen_test.go         | 259 ++++++++++++++
 .../metricsgen/testdata/basic/metrics.gen.go  |  30 ++
 scripts/metricsgen/testdata/basic/metrics.go  |  11 +
 .../testdata/commented/metrics.gen.go         |  30 ++
 .../metricsgen/testdata/commented/metrics.go  |  11 +
 .../metricsgen/testdata/tags/metrics.gen.go   |  54 +++
 scripts/metricsgen/testdata/tags/metrics.go   |  12 +
 11 files changed, 809 insertions(+)
 create mode 100644 internal/state/indexer/metrics.gen.go
 create mode 100644 scripts/metricsgen/metricsgen.go
 create mode 100644 scripts/metricsgen/metricsgen_test.go
 create mode 100644 scripts/metricsgen/testdata/basic/metrics.gen.go
 create mode 100644 scripts/metricsgen/testdata/basic/metrics.go
 create mode 100644 scripts/metricsgen/testdata/commented/metrics.gen.go
 create mode 100644 scripts/metricsgen/testdata/commented/metrics.go
 create mode 100644 scripts/metricsgen/testdata/tags/metrics.gen.go
 create mode 100644 scripts/metricsgen/testdata/tags/metrics.go

diff --git a/Makefile b/Makefile
index a0b970f76..cd9380768 100644
--- a/Makefile
+++ b/Makefile
@@ -251,6 +251,21 @@ mockery:
 	go generate -run="./scripts/mockery_generate.sh" ./...
 .PHONY: mockery

+###############################################################################
+###                                Metrics                                  ###
+###############################################################################
+
+metrics: testdata-metrics
+	go generate -run="scripts/metricsgen" ./...
+.PHONY: metrics
+
+# By convention, the go tool ignores subdirectories of directories named
+# 'testdata'.
This command invokes the generate command on the folder directly + # to avoid this. +testdata-metrics: + ls ./scripts/metricsgen/testdata | xargs -I{} go generate -run="scripts/metricsgen" ./scripts/metricsgen/testdata/{} +.PHONY: testdata-metrics + ############################################################################### ### Local testnet using docker ### ############################################################################### diff --git a/internal/state/indexer/metrics.gen.go b/internal/state/indexer/metrics.gen.go new file mode 100644 index 000000000..8b079d8d5 --- /dev/null +++ b/internal/state/indexer/metrics.gen.go @@ -0,0 +1,51 @@ +// Code generated by metricsgen. DO NOT EDIT. + +package indexer + +import ( + "github.com/go-kit/kit/metrics/discard" + prometheus "github.com/go-kit/kit/metrics/prometheus" + stdprometheus "github.com/prometheus/client_golang/prometheus" +) + +func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { + labels := []string{} + for i := 0; i < len(labelsAndValues); i += 2 { + labels = append(labels, labelsAndValues[i]) + } + return &Metrics{ + BlockEventsSeconds: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "block_events_seconds", + Help: "Latency for indexing block events.", + }, labels).With(labelsAndValues...), + TxEventsSeconds: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "tx_events_seconds", + Help: "Latency for indexing transaction events.", + }, labels).With(labelsAndValues...), + BlocksIndexed: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "blocks_indexed", + Help: "Number of complete blocks indexed.", + }, labels).With(labelsAndValues...), + TransactionsIndexed: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "transactions_indexed", + Help: "Number of transactions indexed.", + }, labels).With(labelsAndValues...), + } +} + +func NopMetrics() *Metrics { + return &Metrics{ + BlockEventsSeconds: discard.NewHistogram(), + TxEventsSeconds: discard.NewHistogram(), + BlocksIndexed: discard.NewCounter(), + TransactionsIndexed: discard.NewCounter(), + } +} diff --git a/internal/state/indexer/metrics.go b/internal/state/indexer/metrics.go index aa64a4bb2..0b92b879e 100644 --- a/internal/state/indexer/metrics.go +++ b/internal/state/indexer/metrics.go @@ -2,12 +2,10 @@ package indexer import ( "github.com/go-kit/kit/metrics" - "github.com/go-kit/kit/metrics/discard" - - prometheus "github.com/go-kit/kit/metrics/prometheus" - stdprometheus "github.com/prometheus/client_golang/prometheus" ) +//go:generate go run github.com/tendermint/tendermint/scripts/metricsgen -struct=Metrics + // MetricsSubsystem is a the subsystem label for the indexer package. const MetricsSubsystem = "indexer" @@ -25,49 +23,3 @@ type Metrics struct { // Number of transactions indexed. TransactionsIndexed metrics.Counter } - -// PrometheusMetrics returns Metrics build using Prometheus client library. -// Optionally, labels can be provided along with their values ("foo", -// "fooValue"). 
-func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { - labels := []string{} - for i := 0; i < len(labelsAndValues); i += 2 { - labels = append(labels, labelsAndValues[i]) - } - return &Metrics{ - BlockEventsSeconds: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ - Namespace: namespace, - Subsystem: MetricsSubsystem, - Name: "block_events_seconds", - Help: "Latency for indexing block events.", - }, labels).With(labelsAndValues...), - TxEventsSeconds: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ - Namespace: namespace, - Subsystem: MetricsSubsystem, - Name: "tx_events_seconds", - Help: "Latency for indexing transaction events.", - }, labels).With(labelsAndValues...), - BlocksIndexed: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ - Namespace: namespace, - Subsystem: MetricsSubsystem, - Name: "blocks_indexed", - Help: "Number of complete blocks indexed.", - }, labels).With(labelsAndValues...), - TransactionsIndexed: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ - Namespace: namespace, - Subsystem: MetricsSubsystem, - Name: "transactions_indexed", - Help: "Number of transactions indexed.", - }, labels).With(labelsAndValues...), - } -} - -// NopMetrics returns an indexer metrics stub that discards all samples. -func NopMetrics() *Metrics { - return &Metrics{ - BlockEventsSeconds: discard.NewHistogram(), - TxEventsSeconds: discard.NewHistogram(), - BlocksIndexed: discard.NewCounter(), - TransactionsIndexed: discard.NewCounter(), - } -} diff --git a/scripts/metricsgen/metricsgen.go b/scripts/metricsgen/metricsgen.go new file mode 100644 index 000000000..70cb36a77 --- /dev/null +++ b/scripts/metricsgen/metricsgen.go @@ -0,0 +1,334 @@ +// metricsgen is a code generation tool for creating constructors for Tendermint +// metrics types. +package main + +import ( + "bytes" + "flag" + "fmt" + "go/ast" + "go/format" + "go/parser" + "go/token" + "go/types" + "io" + "io/fs" + "log" + "os" + "path" + "path/filepath" + "reflect" + "regexp" + "strconv" + "strings" + "text/template" +) + +func init() { + flag.Usage = func() { + fmt.Fprintf(os.Stderr, `Usage: %[1]s -struct + +Generate constructors for the metrics type specified by -struct contained in +the current directory. The tool creates a new file in the current directory +containing the generated code. + +Options: +`, filepath.Base(os.Args[0])) + flag.PrintDefaults() + } +} + +const metricsPackageName = "github.com/go-kit/kit/metrics" + +const ( + metricNameTag = "metrics_name" + labelsTag = "metrics_labels" + bucketTypeTag = "metrics_buckettype" + bucketSizeTag = "metrics_bucketsizes" +) + +var ( + dir = flag.String("dir", ".", "Path to the directory containing the target package") + strct = flag.String("struct", "Metrics", "Struct to parse for metrics") +) + +var bucketType = map[string]string{ + "exprange": "stdprometheus.ExponentialBucketsRange", + "exp": "stdprometheus.ExponentialBuckets", + "lin": "stdprometheus.LinearBuckets", +} + +var tmpl = template.Must(template.New("tmpl").Parse(`// Code generated by metricsgen. DO NOT EDIT. 
+
+package {{ .Package }}
+
+import (
+	"github.com/go-kit/kit/metrics/discard"
+	prometheus "github.com/go-kit/kit/metrics/prometheus"
+	stdprometheus "github.com/prometheus/client_golang/prometheus"
+)
+
+func PrometheusMetrics(namespace string, labelsAndValues...string) *Metrics {
+	labels := []string{}
+	for i := 0; i < len(labelsAndValues); i += 2 {
+		labels = append(labels, labelsAndValues[i])
+	}
+	return &Metrics{
+		{{ range $metric := .ParsedMetrics }}
+		{{- $metric.FieldName }}: prometheus.New{{ $metric.TypeName }}From(stdprometheus.{{$metric.TypeName }}Opts{
+			Namespace: namespace,
+			Subsystem: MetricsSubsystem,
+			Name:      "{{$metric.MetricName }}",
+			Help:      "{{ $metric.Description }}",
+			{{ if ne $metric.HistogramOptions.BucketType "" }}
+			Buckets: {{ $metric.HistogramOptions.BucketType }}({{ $metric.HistogramOptions.BucketSizes }}),
+			{{ else if ne $metric.HistogramOptions.BucketSizes "" }}
+			Buckets: []float64{ {{ $metric.HistogramOptions.BucketSizes }} },
+			{{ end }}
+		{{- if eq (len $metric.Labels) 0 }}
+		}, labels).With(labelsAndValues...),
+		{{ else }}
+		}, append(labels, {{$metric.Labels | printf "%q" }})).With(labelsAndValues...),
+		{{- end }}
+		{{- end }}
+	}
+}
+
+
+func NopMetrics() *Metrics {
+	return &Metrics{
+		{{- range $metric := .ParsedMetrics }}
+		{{ $metric.FieldName }}: discard.New{{ $metric.TypeName }}(),
+		{{- end }}
+	}
+}
+`))
+
+// ParsedMetricField is the data parsed for a single field of a metric struct.
+type ParsedMetricField struct {
+	TypeName    string
+	FieldName   string
+	MetricName  string
+	Description string
+	Labels      string
+
+	HistogramOptions HistogramOpts
+}
+
+type HistogramOpts struct {
+	BucketType  string
+	BucketSizes string
+}
+
+// TemplateData is all of the data required for rendering a metric file template.
+type TemplateData struct {
+	Package       string
+	ParsedMetrics []ParsedMetricField
+}
+
+func main() {
+	flag.Parse()
+	if *strct == "" {
+		log.Fatal("You must specify a non-empty -struct")
+	}
+	td, err := ParseMetricsDir(".", *strct)
+	if err != nil {
+		log.Fatalf("Parsing file: %v", err)
+	}
+	out := filepath.Join(*dir, "metrics.gen.go")
+	f, err := os.Create(out)
+	if err != nil {
+		log.Fatalf("Opening file: %v", err)
+	}
+	err = GenerateMetricsFile(f, td)
+	if err != nil {
+		log.Fatalf("Generating code: %v", err)
+	}
+}
+func ignoreTestFiles(f fs.FileInfo) bool {
+	return !strings.Contains(f.Name(), "_test.go")
+}
+
+// ParseMetricsDir parses the dir and scans for a struct matching structName,
+// ignoring all test files. ParseMetricsDir iterates the fields of the metrics
+// struct and builds a TemplateData using the data obtained from the abstract syntax tree.
+func ParseMetricsDir(dir string, structName string) (TemplateData, error) {
+	fs := token.NewFileSet()
+	d, err := parser.ParseDir(fs, dir, ignoreTestFiles, parser.ParseComments)
+	if err != nil {
+		return TemplateData{}, err
+	}
+	if len(d) > 1 {
+		return TemplateData{}, fmt.Errorf("multiple packages found in %s", dir)
+	}
+	if len(d) == 0 {
+		return TemplateData{}, fmt.Errorf("no go packages found in %s", dir)
+	}
+
+	// Grab the package name.
+	var pkgName string
+	var pkg *ast.Package
+	for pkgName, pkg = range d {
+	}
+	td := TemplateData{
+		Package: pkgName,
+	}
+	// Grab the metrics struct
+	m, mPkgName, err := findMetricsStruct(pkg.Files, structName)
+	if err != nil {
+		return TemplateData{}, err
+	}
+	for _, f := range m.Fields.List {
+		if !isMetric(f.Type, mPkgName) {
+			continue
+		}
+		pmf := parseMetricField(f)
+		td.ParsedMetrics = append(td.ParsedMetrics, pmf)
+	}
+
+	return td, err
+}
+
+// GenerateMetricsFile executes the metrics file template, writing the result
+// into the io.Writer.
+func GenerateMetricsFile(w io.Writer, td TemplateData) error {
+	b := []byte{}
+	buf := bytes.NewBuffer(b)
+	err := tmpl.Execute(buf, td)
+	if err != nil {
+		return err
+	}
+	b, err = format.Source(buf.Bytes())
+	if err != nil {
+		return err
+	}
+	_, err = io.Copy(w, bytes.NewBuffer(b))
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+func findMetricsStruct(files map[string]*ast.File, structName string) (*ast.StructType, string, error) {
+	var (
+		st *ast.StructType
+	)
+	for _, file := range files {
+		mPkgName, err := extractMetricsPackageName(file.Imports)
+		if err != nil {
+			return nil, "", fmt.Errorf("unable to determine metrics package name: %v", err)
+		}
+		if !ast.FilterFile(file, func(name string) bool {
+			return name == structName
+		}) {
+			continue
+		}
+		ast.Inspect(file, func(n ast.Node) bool {
+			switch f := n.(type) {
+			case *ast.TypeSpec:
+				if f.Name.Name == structName {
+					var ok bool
+					st, ok = f.Type.(*ast.StructType)
+					if !ok {
+						err = fmt.Errorf("found identifier for %q of wrong type", structName)
+					}
+				}
+				return false
+			default:
+				return true
+			}
+		})
+		if err != nil {
+			return nil, "", err
+		}
+		if st != nil {
+			return st, mPkgName, nil
+		}
+	}
+	return nil, "", fmt.Errorf("target struct %q not found in dir", structName)
+}
+
+func parseMetricField(f *ast.Field) ParsedMetricField {
+	var comment string
+	if f.Doc != nil {
+		for _, c := range f.Doc.List {
+			comment += strings.TrimPrefix(c.Text, "// ")
+		}
+	}
+	pmf := ParsedMetricField{
+		Description: comment,
+		MetricName:  extractFieldName(f.Names[0].String(), f.Tag),
+		FieldName:   f.Names[0].String(),
+		TypeName:    extractTypeName(f.Type),
+		Labels:      extractLabels(f.Tag),
+	}
+	if pmf.TypeName == "Histogram" {
+		pmf.HistogramOptions = extractHistogramOptions(f.Tag)
+	}
+	return pmf
+}
+
+func extractTypeName(e ast.Expr) string {
+	return strings.TrimPrefix(path.Ext(types.ExprString(e)), ".")
+}
+
+func isMetric(e ast.Expr, mPkgName string) bool {
+	return strings.Contains(types.ExprString(e), fmt.Sprintf("%s.", mPkgName))
+}
+
+func extractLabels(bl *ast.BasicLit) string {
+	if bl != nil {
+		t := reflect.StructTag(strings.Trim(bl.Value, "`"))
+		if v := t.Get(labelsTag); v != "" {
+			return v
+		}
+	}
+	return ""
+}
+
+func extractFieldName(name string, tag *ast.BasicLit) string {
+	if tag != nil {
+		t := reflect.StructTag(strings.Trim(tag.Value, "`"))
+		if v := t.Get(metricNameTag); v != "" {
+			return v
+		}
+	}
+	return toSnakeCase(name)
+}
+
+func extractHistogramOptions(tag *ast.BasicLit) HistogramOpts {
+	h := HistogramOpts{}
+	if tag != nil {
+		t := reflect.StructTag(strings.Trim(tag.Value, "`"))
+		if v := t.Get(bucketTypeTag); v != "" {
+			h.BucketType = bucketType[v]
+		}
+		if v := t.Get(bucketSizeTag); v != "" {
+			h.BucketSizes = v
+		}
+	}
+	return h
+}
+
+func extractMetricsPackageName(imports []*ast.ImportSpec) (string, error) {
+	for _, i := range imports {
+		u, err := strconv.Unquote(i.Path.Value)
+		if err != nil {
+			return "", err
+		}
+		if u == metricsPackageName {
+			if i.Name != nil {
+				return i.Name.Name, nil
+			}
+			return path.Base(u), nil
+		}
+	}
+	return "", nil
+}
+
+var capitalChange = regexp.MustCompile("([a-z0-9])([A-Z])")
+
+func toSnakeCase(str string) string {
+	snake := capitalChange.ReplaceAllString(str, "${1}_${2}")
+	return strings.ToLower(snake)
+}
diff --git a/scripts/metricsgen/metricsgen_test.go b/scripts/metricsgen/metricsgen_test.go
new file mode 100644
index 000000000..83251e651
--- /dev/null
+++ b/scripts/metricsgen/metricsgen_test.go
@@ -0,0 +1,259 @@
+package main_test
+
+import (
+	"bytes"
+	"fmt"
+	"go/parser"
+	"go/token"
+	"io"
+	"io/ioutil"
+	"os"
+	"path"
+	"path/filepath"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+	metricsgen "github.com/tendermint/tendermint/scripts/metricsgen"
+)
+
+const testDataDir = "./testdata"
+
+func TestSimpleTemplate(t *testing.T) {
+	m := metricsgen.ParsedMetricField{
+		TypeName:    "Histogram",
+		FieldName:   "MyMetric",
+		MetricName:  "request_count",
+		Description: "how many requests were made since the start of the process",
+		Labels:      "first, second, third",
+	}
+	td := metricsgen.TemplateData{
+		Package:       "mypack",
+		ParsedMetrics: []metricsgen.ParsedMetricField{m},
+	}
+	b := bytes.NewBuffer([]byte{})
+	err := metricsgen.GenerateMetricsFile(b, td)
+	if err != nil {
+		t.Fatalf("unable to parse template %v", err)
+	}
+}
+
+func TestFromData(t *testing.T) {
+	infos, err := ioutil.ReadDir(testDataDir)
+	if err != nil {
+		t.Fatalf("unable to open file %v", err)
+	}
+	for _, dir := range infos {
+		t.Run(dir.Name(), func(t *testing.T) {
+			if !dir.IsDir() {
+				t.Fatalf("expected file %s to be directory", dir.Name())
+			}
+			dirName := path.Join(testDataDir, dir.Name())
+			pt, err := metricsgen.ParseMetricsDir(dirName, "Metrics")
+			if err != nil {
+				t.Fatalf("unable to parse from dir %q: %v", dirName, err)
+			}
+			outFile := path.Join(dirName, "out.go")
+			of, err := os.Create(outFile)
+			if err != nil {
+				t.Fatalf("unable to open file %s: %v", outFile, err)
+			}
+			defer os.Remove(outFile)
+			if err := metricsgen.GenerateMetricsFile(of, pt); err != nil {
+				t.Fatalf("unable to generate metrics file %s: %v", outFile, err)
+			}
+			if _, err := parser.ParseFile(token.NewFileSet(), outFile, nil, parser.AllErrors); err != nil {
+				t.Fatalf("unable to parse generated file %s: %v", outFile, err)
+			}
+			bNew, err := ioutil.ReadFile(outFile)
+			if err != nil {
+				t.Fatalf("unable to read generated file %s: %v", outFile, err)
+			}
+			goldenFile := path.Join(dirName, "metrics.gen.go")
+			bOld, err := ioutil.ReadFile(goldenFile)
+			if err != nil {
+				t.Fatalf("unable to read file %s: %v", goldenFile, err)
+			}
+			if !bytes.Equal(bNew, bOld) {
+				t.Fatalf("newly generated code in file %s does not match golden file %s\n"+
+					"if the output of the metricsgen tool is expected to change run the following make target: \n"+
+					"\tmake metrics", outFile, goldenFile)
+			}
+		})
+	}
+}
+
+func TestParseMetricsStruct(t *testing.T) {
+	const pkgName = "mypkg"
+	metricsTests := []struct {
+		name          string
+		shouldError   bool
+		metricsStruct string
+		expected      metricsgen.TemplateData
+	}{
+		{
+			name: "basic",
+			metricsStruct: `type Metrics struct {
+				myGauge metrics.Gauge
+			}`,
+			expected: metricsgen.TemplateData{
+				Package: pkgName,
+				ParsedMetrics: []metricsgen.ParsedMetricField{
+					{
+						TypeName:   "Gauge",
+						FieldName:  "myGauge",
+						MetricName: "my_gauge",
+					},
+				},
+			},
+		},
+		{
+			name: "histogram",
+			metricsStruct: "type Metrics struct {\n" +
"myHistogram metrics.Histogram `metrics_buckettype:\"exp\" metrics_bucketsizes:\"1, 100, .8\"`\n" + + "}", + expected: metricsgen.TemplateData{ + Package: pkgName, + ParsedMetrics: []metricsgen.ParsedMetricField{ + { + TypeName: "Histogram", + FieldName: "myHistogram", + MetricName: "my_histogram", + + HistogramOptions: metricsgen.HistogramOpts{ + BucketType: "stdprometheus.ExponentialBuckets", + BucketSizes: "1, 100, .8", + }, + }, + }, + }, + }, + { + name: "labeled name", + metricsStruct: "type Metrics struct {\n" + + "myCounter metrics.Counter `metrics_name:\"new_name\"`\n" + + "}", + expected: metricsgen.TemplateData{ + Package: pkgName, + ParsedMetrics: []metricsgen.ParsedMetricField{ + { + TypeName: "Counter", + FieldName: "myCounter", + MetricName: "new_name", + }, + }, + }, + }, + { + name: "metric labels", + metricsStruct: "type Metrics struct {\n" + + "myCounter metrics.Counter `metrics_labels:\"label1, label2\"`\n" + + "}", + expected: metricsgen.TemplateData{ + Package: pkgName, + ParsedMetrics: []metricsgen.ParsedMetricField{ + { + TypeName: "Counter", + FieldName: "myCounter", + MetricName: "my_counter", + Labels: "label1, label2", + }, + }, + }, + }, + { + name: "ignore non-metric field", + metricsStruct: `type Metrics struct { + myCounter metrics.Counter + nonMetric string + }`, + expected: metricsgen.TemplateData{ + Package: pkgName, + ParsedMetrics: []metricsgen.ParsedMetricField{ + { + TypeName: "Counter", + FieldName: "myCounter", + MetricName: "my_counter", + }, + }, + }, + }, + } + for _, testCase := range metricsTests { + t.Run(testCase.name, func(t *testing.T) { + dir, err := os.MkdirTemp(os.TempDir(), "metricsdir") + if err != nil { + t.Fatalf("unable to create directory: %v", err) + } + defer os.Remove(dir) + f, err := os.Create(filepath.Join(dir, "metrics.go")) + if err != nil { + t.Fatalf("unable to open file: %v", err) + } + pkgLine := fmt.Sprintf("package %s\n", pkgName) + importClause := ` + import( + "github.com/go-kit/kit/metrics" + ) + ` + + _, err = io.WriteString(f, pkgLine) + require.NoError(t, err) + _, err = io.WriteString(f, importClause) + require.NoError(t, err) + _, err = io.WriteString(f, testCase.metricsStruct) + require.NoError(t, err) + + td, err := metricsgen.ParseMetricsDir(dir, "Metrics") + if testCase.shouldError { + require.Error(t, err) + } else { + require.NoError(t, err) + require.Equal(t, testCase.expected, td) + } + }) + } +} + +func TestParseAliasedMetric(t *testing.T) { + aliasedData := ` + package mypkg + + import( + mymetrics "github.com/go-kit/kit/metrics" + ) + type Metrics struct { + m mymetrics.Gauge + } + ` + dir, err := os.MkdirTemp(os.TempDir(), "metricsdir") + if err != nil { + t.Fatalf("unable to create directory: %v", err) + } + defer os.Remove(dir) + f, err := os.Create(filepath.Join(dir, "metrics.go")) + if err != nil { + t.Fatalf("unable to open file: %v", err) + } + _, err = io.WriteString(f, aliasedData) + if err != nil { + t.Fatalf("unable to write to file: %v", err) + } + td, err := metricsgen.ParseMetricsDir(dir, "Metrics") + require.NoError(t, err) + + expected := + metricsgen.TemplateData{ + Package: "mypkg", + ParsedMetrics: []metricsgen.ParsedMetricField{ + { + TypeName: "Gauge", + FieldName: "m", + MetricName: "m", + }, + }, + } + require.Equal(t, expected, td) +} diff --git a/scripts/metricsgen/testdata/basic/metrics.gen.go b/scripts/metricsgen/testdata/basic/metrics.gen.go new file mode 100644 index 000000000..d541cb2db --- /dev/null +++ b/scripts/metricsgen/testdata/basic/metrics.gen.go @@ -0,0 +1,30 @@ 
+// Code generated by metricsgen. DO NOT EDIT.
+
+package basic
+
+import (
+	"github.com/go-kit/kit/metrics/discard"
+	prometheus "github.com/go-kit/kit/metrics/prometheus"
+	stdprometheus "github.com/prometheus/client_golang/prometheus"
+)
+
+func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics {
+	labels := []string{}
+	for i := 0; i < len(labelsAndValues); i += 2 {
+		labels = append(labels, labelsAndValues[i])
+	}
+	return &Metrics{
+		Height: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: MetricsSubsystem,
+			Name:      "height",
+			Help:      "simple metric that tracks the height of the chain.",
+		}, labels).With(labelsAndValues...),
+	}
+}
+
+func NopMetrics() *Metrics {
+	return &Metrics{
+		Height: discard.NewGauge(),
+	}
+}
diff --git a/scripts/metricsgen/testdata/basic/metrics.go b/scripts/metricsgen/testdata/basic/metrics.go
new file mode 100644
index 000000000..1a361f90f
--- /dev/null
+++ b/scripts/metricsgen/testdata/basic/metrics.go
@@ -0,0 +1,11 @@
+package basic
+
+import "github.com/go-kit/kit/metrics"
+
+//go:generate go run ../../../../scripts/metricsgen -struct=Metrics
+
+// Metrics contains metrics exposed by this package.
+type Metrics struct {
+	// simple metric that tracks the height of the chain.
+	Height metrics.Gauge
+}
diff --git a/scripts/metricsgen/testdata/commented/metrics.gen.go b/scripts/metricsgen/testdata/commented/metrics.gen.go
new file mode 100644
index 000000000..038da3d46
--- /dev/null
+++ b/scripts/metricsgen/testdata/commented/metrics.gen.go
@@ -0,0 +1,30 @@
+// Code generated by metricsgen. DO NOT EDIT.
+
+package commented
+
+import (
+	"github.com/go-kit/kit/metrics/discard"
+	prometheus "github.com/go-kit/kit/metrics/prometheus"
+	stdprometheus "github.com/prometheus/client_golang/prometheus"
+)
+
+func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics {
+	labels := []string{}
+	for i := 0; i < len(labelsAndValues); i += 2 {
+		labels = append(labels, labelsAndValues[i])
+	}
+	return &Metrics{
+		Field: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: MetricsSubsystem,
+			Name:      "field",
+			Help:      "Height of the chain.We expect multi-line comments to parse correctly.",
+		}, labels).With(labelsAndValues...),
+	}
+}
+
+func NopMetrics() *Metrics {
+	return &Metrics{
+		Field: discard.NewGauge(),
+	}
+}
diff --git a/scripts/metricsgen/testdata/commented/metrics.go b/scripts/metricsgen/testdata/commented/metrics.go
new file mode 100644
index 000000000..174f1e233
--- /dev/null
+++ b/scripts/metricsgen/testdata/commented/metrics.go
@@ -0,0 +1,11 @@
+package commented
+
+import "github.com/go-kit/kit/metrics"
+
+//go:generate go run ../../../../scripts/metricsgen -struct=Metrics
+
+type Metrics struct {
+	// Height of the chain.
+	// We expect multi-line comments to parse correctly.
+	Field metrics.Gauge
+}
diff --git a/scripts/metricsgen/testdata/tags/metrics.gen.go b/scripts/metricsgen/testdata/tags/metrics.gen.go
new file mode 100644
index 000000000..7ac292d3c
--- /dev/null
+++ b/scripts/metricsgen/testdata/tags/metrics.gen.go
@@ -0,0 +1,54 @@
+// Code generated by metricsgen. DO NOT EDIT.
+
+package tags
+
+import (
+	"github.com/go-kit/kit/metrics/discard"
+	prometheus "github.com/go-kit/kit/metrics/prometheus"
+	stdprometheus "github.com/prometheus/client_golang/prometheus"
+)
+
+func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics {
+	labels := []string{}
+	for i := 0; i < len(labelsAndValues); i += 2 {
+		labels = append(labels, labelsAndValues[i])
+	}
+	return &Metrics{
+		WithLabels: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: MetricsSubsystem,
+			Name:      "with_labels",
+			Help:      "",
+		}, append(labels, "step,time")).With(labelsAndValues...), WithExpBuckets: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{
+			Namespace: namespace,
+			Subsystem: MetricsSubsystem,
+			Name:      "with_exp_buckets",
+			Help:      "",
+
+			Buckets: stdprometheus.ExponentialBuckets(.1, 100, 8),
+		}, labels).With(labelsAndValues...),
+		WithBuckets: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{
+			Namespace: namespace,
+			Subsystem: MetricsSubsystem,
+			Name:      "with_buckets",
+			Help:      "",
+
+			Buckets: []float64{1, 2, 3, 4, 5},
+		}, labels).With(labelsAndValues...),
+		Named: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: MetricsSubsystem,
+			Name:      "metric_with_name",
+			Help:      "",
+		}, labels).With(labelsAndValues...),
+	}
+}
+
+func NopMetrics() *Metrics {
+	return &Metrics{
+		WithLabels:     discard.NewCounter(),
+		WithExpBuckets: discard.NewHistogram(),
+		WithBuckets:    discard.NewHistogram(),
+		Named:          discard.NewCounter(),
+	}
+}
diff --git a/scripts/metricsgen/testdata/tags/metrics.go b/scripts/metricsgen/testdata/tags/metrics.go
new file mode 100644
index 000000000..8562dcf43
--- /dev/null
+++ b/scripts/metricsgen/testdata/tags/metrics.go
@@ -0,0 +1,12 @@
+package tags
+
+import "github.com/go-kit/kit/metrics"
+
+//go:generate go run ../../../../scripts/metricsgen -struct=Metrics
+
+type Metrics struct {
+	WithLabels     metrics.Counter   `metrics_labels:"step,time"`
+	WithExpBuckets metrics.Histogram `metrics_buckettype:"exp" metrics_bucketsizes:".1,100,8"`
+	WithBuckets    metrics.Histogram `metrics_bucketsizes:"1, 2, 3, 4, 5"`
+	Named          metrics.Counter   `metrics_name:"metric_with_name"`
+}
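
For reference, the constructors emitted by metricsgen are consumed like any other Tendermint `Metrics` value. The sketch below is illustrative only and is not part of this patch: `wireIndexerMetrics` is a hypothetical helper, and the `"tendermint"` namespace and `chain_id` label pair are invented example values. It relies solely on the `PrometheusMetrics` and `NopMetrics` constructors generated above (and, being under `internal/`, such a call site must live inside the tendermint repository itself).

```go
package node // hypothetical call site inside the tendermint repository

import (
	"github.com/tendermint/tendermint/internal/state/indexer"
)

// wireIndexerMetrics is an illustrative helper, not part of this patch.
func wireIndexerMetrics(testing bool) *indexer.Metrics {
	if testing {
		// No-op metrics: every recorded sample is discarded.
		return indexer.NopMetrics()
	}
	// Prometheus-backed metrics; the trailing arguments are optional
	// label key/value pairs applied to every metric in the struct.
	return indexer.PrometheusMetrics("tendermint", "chain_id", "test-chain-1")
}
```

After editing a `Metrics` struct (or its tags), the generated file is refreshed via the `//go:generate go run github.com/tendermint/tendermint/scripts/metricsgen -struct=Metrics` directive shown in the diff; the `testdata-metrics` make target above does the same for the testdata packages, which `go generate ./...` would otherwise skip.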