consensus: add additional metrics for abci++ data (#8480)

This pull request adds an additional set of metrics targeted at providing more visibility into `abci++`. 

The following metrics are added and exposed through the `metrics` endpoint:

```
tendermint_consensus_proposal_receive_count{chain_id="test-chain-IrF74Y",status="accepted"} 34
tendermint_consensus_proposal_create_count{chain_id="test-chain-IrF74Y"} 34
tendermint_consensus_vote_extension_receive_count{chain_id="test-chain-IrF74Y",status="accepted"} 34
tendermint_consensus_round_voting_power_percent{chain_id="test-chain-IrF74Y",vote_type="precommit"} 1
tendermint_consensus_round_voting_power_percent{chain_id="test-chain-IrF74Y",vote_type="prevote"} 1
tendermint_state_consensus_param_updates{chain_id="test-chain-IrF74Y"} 0
tendermint_state_validator_set_updates{chain_id="test-chain-IrF74Y"} 0
tendermint_consensus_late_votes{chain_id="test-chain-IrF74Y",vote_type="precommit"} 16
```

This pull request also updates the `metrics.md` file to include some metrics that were previously missed. My hope is to generate the `metrics.md` file with a future version of the tool being architected in #8479
This commit is contained in:
William Banfield
2022-05-10 12:48:13 -04:00
committed by GitHub
parent a4c3b5cab4
commit c052181e32
5 changed files with 196 additions and 36 deletions

View File

@@ -18,40 +18,53 @@ Listen address can be changed in the config file (see
The following metrics are available:
| **Name** | **Type** | **Tags** | **Description** |
| -------------------------------------- | --------- | ------------- | ---------------------------------------------------------------------- |
| abci_connection_method_timing | Histogram | method, type | Timings for each of the ABCI methods |
| consensus_height | Gauge | | Height of the chain |
| consensus_validators | Gauge | | Number of validators |
| consensus_validators_power | Gauge | | Total voting power of all validators |
| consensus_validator_power | Gauge | | Voting power of the node if in the validator set |
| consensus_validator_last_signed_height | Gauge | | Last height the node signed a block, if the node is a validator |
| consensus_validator_missed_blocks | Gauge | | Total amount of blocks missed for the node, if the node is a validator |
| consensus_missing_validators | Gauge | | Number of validators who did not sign |
| consensus_missing_validators_power | Gauge | | Total voting power of the missing validators |
| consensus_byzantine_validators | Gauge | | Number of validators who tried to double sign |
| consensus_byzantine_validators_power | Gauge | | Total voting power of the byzantine validators |
| consensus_block_interval_seconds | Histogram | | Time between this and last block (Block.Header.Time) in seconds |
| consensus_rounds | Gauge | | Number of rounds |
| consensus_num_txs | Gauge | | Number of transactions |
| consensus_total_txs | Gauge | | Total number of transactions committed |
| consensus_block_parts | counter | peer_id | number of blockparts transmitted by peer |
| consensus_latest_block_height | gauge | | /status sync_info number |
| consensus_fast_syncing | gauge | | either 0 (not fast syncing) or 1 (syncing) |
| consensus_state_syncing | gauge | | either 0 (not state syncing) or 1 (syncing) |
| consensus_block_size_bytes | Gauge | | Block size in bytes |
| evidence_pool_num_evidence | Gauge | | Number of evidence in the evidence pool
| p2p_peers | Gauge | | Number of peers node's connected to |
| p2p_peer_receive_bytes_total | counter | peer_id, chID | number of bytes per channel received from a given peer |
| p2p_peer_send_bytes_total | counter | peer_id, chID | number of bytes per channel sent to a given peer |
| p2p_peer_pending_send_bytes | gauge | peer_id | number of pending bytes to be sent to a given peer |
| p2p_num_txs | gauge | peer_id | number of transactions submitted by each peer_id |
| p2p_pending_send_bytes | gauge | peer_id | amount of data pending to be sent to peer |
| mempool_size | Gauge | | Number of uncommitted transactions |
| mempool_tx_size_bytes | histogram | | transaction sizes in bytes |
| mempool_failed_txs | counter | | number of failed transactions |
| mempool_recheck_times | counter | | number of transactions rechecked in the mempool |
| state_block_processing_time | histogram | | time between BeginBlock and EndBlock in ms |
| **Name** | **Type** | **Tags** | **Description** |
|-----------------------------------------|-----------|-----------------|--------------------------------------------------------------------------------------------------------------------------------------------|
| abci_connection_method_timing | Histogram | method, type | Timings for each of the ABCI methods |
| consensus_height | Gauge | | Height of the chain |
| consensus_validators | Gauge | | Number of validators |
| consensus_validators_power | Gauge | | Total voting power of all validators |
| consensus_validator_power | Gauge | | Voting power of the node if in the validator set |
| consensus_validator_last_signed_height | Gauge | | Last height the node signed a block, if the node is a validator |
| consensus_validator_missed_blocks | Gauge | | Total amount of blocks missed for the node, if the node is a validator |
| consensus_missing_validators | Gauge | | Number of validators who did not sign |
| consensus_missing_validators_power | Gauge | | Total voting power of the missing validators |
| consensus_byzantine_validators | Gauge | | Number of validators who tried to double sign |
| consensus_byzantine_validators_power | Gauge | | Total voting power of the byzantine validators |
| consensus_block_interval_seconds | Histogram | | Time between this and last block (Block.Header.Time) in seconds |
| consensus_rounds | Gauge | | Number of rounds |
| consensus_num_txs | Gauge | | Number of transactions |
| consensus_total_txs | Gauge | | Total number of transactions committed |
| consensus_block_parts | Counter | peer_id | number of blockparts transmitted by peer |
| consensus_latest_block_height | Gauge | | /status sync_info number |
| consensus_fast_syncing | Gauge | | either 0 (not fast syncing) or 1 (syncing) |
| consensus_state_syncing | Gauge | | either 0 (not state syncing) or 1 (syncing) |
| consensus_block_size_bytes | Gauge | | Block size in bytes |
| consensus_step_duration | Histogram | step | Histogram of durations for each step in the consensus protocol |
| consensus_block_gossip_receive_latency | Histogram | | Histogram of time taken to receive a block in seconds, measured from when a new block is first discovered to when the block is completed |
| consensus_block_gossip_parts_received | Counter | matches_current | Number of block parts received by the node |
| consensus_quorum_prevote_delay | Gauge | | Interval in seconds between the proposal timestamp and the timestamp of the earliest prevote that achieved a quorum |
| consensus_full_prevote_delay | Gauge | | Interval in seconds between the proposal timestamp and the timestamp of the latest prevote in a round where all validators voted |
| consensus_proposal_timestamp_difference | Histogram | | Difference between the timestamp in the proposal message and the local time of the validator at the time it received the message |
| consensus_vote_extension_receive_count | Counter | status | Number of vote extensions received |
| consensus_proposal_receive_count | Counter | status | Total number of proposals received by the node since process start |
| consensus_proposal_create_count | Counter | | Total number of proposals created by the node since process start |
| consensus_round_voting_power_percent | Gauge | vote_type | A value between 0 and 1.0 representing the percentage of the total voting power per vote type received within a round |
| consensus_late_votes | Counter | vote_type | Number of votes received by the node since process start that correspond to earlier heights and rounds than this node is currently in. |
| evidence_pool_num_evidence | Gauge | | Number of evidence in the evidence pool |
| p2p_peers | Gauge | | Number of peers node's connected to |
| p2p_peer_receive_bytes_total | Counter | peer_id, chID | number of bytes per channel received from a given peer |
| p2p_peer_send_bytes_total | Counter | peer_id, chID | number of bytes per channel sent to a given peer |
| p2p_peer_pending_send_bytes | Gauge | peer_id | number of pending bytes to be sent to a given peer |
| p2p_num_txs | Gauge | peer_id | number of transactions submitted by each peer_id |
| p2p_pending_send_bytes | Gauge | peer_id | amount of data pending to be sent to peer |
| mempool_size | Gauge | | Number of uncommitted transactions |
| mempool_tx_size_bytes | Histogram | | transaction sizes in bytes |
| mempool_failed_txs | Counter | | number of failed transactions |
| mempool_recheck_times | Counter | | number of transactions rechecked in the mempool |
| state_block_processing_time | Histogram | | time between BeginBlock and EndBlock in ms |
| state_consensus_param_updates | Counter | | number of consensus parameter updates returned by the application since process start |
| state_validator_set_updates | Counter | | number of validator set updates returned by the application since process start |
## Useful queries

View File

@@ -8,6 +8,7 @@ import (
"github.com/go-kit/kit/metrics/discard"
cstypes "github.com/tendermint/tendermint/internal/consensus/types"
tmproto "github.com/tendermint/tendermint/proto/tendermint/types"
"github.com/tendermint/tendermint/types"
prometheus "github.com/go-kit/kit/metrics/prometheus"
@@ -103,6 +104,33 @@ type Metrics struct {
// the proposal message and the local time of the validator at the time
// that the validator received the message.
ProposalTimestampDifference metrics.Histogram
// VoteExtensionReceiveCount is the number of vote extensions received by this
// node. The metric is annotated by the status of the vote extension from the
// application, either 'accepted' or 'rejected'.
VoteExtensionReceiveCount metrics.Counter
// ProposalReceiveCount is the total number of proposals received by this node
// since process start.
// The metric is annotated by the status of the proposal from the application,
// either 'accepted' or 'rejected'.
ProposalReceiveCount metrics.Counter
// ProposalCreateCount is the total number of proposals created by this node
// since process start.
ProposalCreateCount metrics.Counter
// RoundVotingPowerPercent is the percentage of the total voting power received
// within a round. The value begins at 0 for each round and approaches 1.0 as
// additional voting power is observed. The metric is labeled by vote type.
RoundVotingPowerPercent metrics.Gauge
// LateVotes stores the number of votes that were received by this node that
// correspond to earlier heights and rounds than this node is currently
// in.
LateVotes metrics.Counter
}
// PrometheusMetrics returns Metrics build using Prometheus client library.
@@ -280,6 +308,43 @@ func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics {
"Only calculated when a new block is proposed.",
Buckets: []float64{-10, -.5, -.025, 0, .1, .5, 1, 1.5, 2, 10},
}, append(labels, "is_timely")).With(labelsAndValues...),
VoteExtensionReceiveCount: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "vote_extension_receive_count",
Help: "Number of vote extensions received by the node since process start, labeled by " +
"the application's response to VerifyVoteExtension, either accept or reject.",
}, append(labels, "status")).With(labelsAndValues...),
ProposalReceiveCount: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "proposal_receive_count",
Help: "Number of vote proposals received by the node since process start, labeled by " +
"the application's response to ProcessProposal, either accept or reject.",
}, append(labels, "status")).With(labelsAndValues...),
ProposalCreateCount: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "proposal_create_count",
Help: "Number of proposals created by the node since process start.",
}, labels).With(labelsAndValues...),
RoundVotingPowerPercent: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "round_voting_power_percent",
Help: "Percentage of the total voting power received with a round. " +
"The value begins at 0 for each round and approaches 1.0 as additional " +
"voting power is observed.",
}, append(labels, "vote_type")).With(labelsAndValues...),
LateVotes: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "late_votes",
Help: "Number of votes received by the node since process start that correspond to earlier heights and rounds than this node is currently in.",
}, append(labels, "vote_type")).With(labelsAndValues...),
}
}
@@ -317,6 +382,11 @@ func NopMetrics() *Metrics {
QuorumPrevoteDelay: discard.NewGauge(),
FullPrevoteDelay: discard.NewGauge(),
ProposalTimestampDifference: discard.NewHistogram(),
VoteExtensionReceiveCount: discard.NewCounter(),
ProposalReceiveCount: discard.NewCounter(),
ProposalCreateCount: discard.NewCounter(),
RoundVotingPowerPercent: discard.NewGauge(),
LateVotes: discard.NewCounter(),
}
}
@@ -336,10 +406,45 @@ func (m *Metrics) MarkBlockGossipComplete() {
m.BlockGossipReceiveLatency.Observe(time.Since(m.blockGossipStart).Seconds())
}
// MarkProposalProcessed increments ProposalReceiveCount by one. The sample
// carries a "status" label whose value is "accepted" when accepted is true and
// "rejected" otherwise.
func (m *Metrics) MarkProposalProcessed(accepted bool) {
	label := "rejected"
	if accepted {
		label = "accepted"
	}
	counter := m.ProposalReceiveCount.With("status", label)
	counter.Add(1)
}
// MarkVoteExtensionReceived increments VoteExtensionReceiveCount by one. The
// sample is labeled with status "accepted" when accepted is true, and with
// status "rejected" when it is false.
func (m *Metrics) MarkVoteExtensionReceived(accepted bool) {
	var outcome string
	if accepted {
		outcome = "accepted"
	} else {
		outcome = "rejected"
	}
	m.VoteExtensionReceiveCount.With("status", outcome).Add(1)
}
// MarkVoteReceived adds the share of voting power carried by a received vote
// to RoundVotingPowerPercent for the vote's type.
//
// power is the voting power behind the vote and totalPower is the voting power
// it is measured against; the gauge grows by power/totalPower. The "vote_type"
// label is the lower-cased name of vt with its "SIGNED_MSG_TYPE_" prefix
// removed.
//
// A non-positive totalPower is ignored: dividing by it would add Inf, NaN or a
// negative share to the gauge and make the reported value meaningless.
func (m *Metrics) MarkVoteReceived(vt tmproto.SignedMsgType, power, totalPower int64) {
	if totalPower <= 0 {
		return
	}
	share := float64(power) / float64(totalPower)
	voteType := strings.ToLower(strings.TrimPrefix(vt.String(), "SIGNED_MSG_TYPE_"))
	m.RoundVotingPowerPercent.With("vote_type", voteType).Add(share)
}
// MarkRound records round r in Rounds, observes the number of seconds elapsed
// since st in RoundDuration, and then sets RoundVotingPowerPercent to 0 for the
// prevote and precommit vote types, in that order.
func (m *Metrics) MarkRound(r int32, st time.Time) {
	m.Rounds.Set(float64(r))
	m.RoundDuration.Observe(time.Since(st).Seconds())

	// The label is the lower-cased enum name without its common prefix.
	for _, vt := range []tmproto.SignedMsgType{tmproto.PrevoteType, tmproto.PrecommitType} {
		label := strings.ToLower(strings.TrimPrefix(vt.String(), "SIGNED_MSG_TYPE_"))
		m.RoundVotingPowerPercent.With("vote_type", label).Set(0)
	}
}
// MarkLateVote increments LateVotes by one for the given vote type. The
// "vote_type" label is the lower-cased name of vt with its "SIGNED_MSG_TYPE_"
// prefix removed.
func (m *Metrics) MarkLateVote(vt tmproto.SignedMsgType) {
	name := strings.TrimPrefix(vt.String(), "SIGNED_MSG_TYPE_")
	label := strings.ToLower(name)
	lateVotes := m.LateVotes.With("vote_type", label)
	lateVotes.Add(1)
}
func (m *Metrics) MarkStep(s cstypes.RoundStepType) {

View File

@@ -1334,6 +1334,7 @@ func (cs *State) defaultDecideProposal(ctx context.Context, height int64, round
} else if block == nil {
return
}
cs.metrics.ProposalCreateCount.Add(1)
blockParts, err = block.MakePartSet(types.BlockPartSizeBytes)
if err != nil {
cs.logger.Error("unable to create proposal block part set", "error", err)
@@ -1531,6 +1532,7 @@ func (cs *State) defaultDoPrevote(ctx context.Context, height int64, round int32
if err != nil {
panic(fmt.Sprintf("ProcessProposal: %v", err))
}
cs.metrics.MarkProposalProcessed(isAppValid)
// Vote nil if the Application rejected the block
if !isAppValid {
@@ -2297,6 +2299,10 @@ func (cs *State) addVote(
"cs_height", cs.Height,
)
if vote.Height < cs.Height || (vote.Height == cs.Height && vote.Round < cs.Round) {
cs.metrics.MarkLateVote(vote.Type)
}
// A precommit for the previous height?
// These come in while we wait timeoutCommit
if vote.Height+1 == cs.Height && vote.Type == tmproto.PrecommitType {
@@ -2337,7 +2343,9 @@ func (cs *State) addVote(
// Verify VoteExtension if precommit
if vote.Type == tmproto.PrecommitType {
if err = cs.blockExec.VerifyVoteExtension(ctx, vote); err != nil {
err := cs.blockExec.VerifyVoteExtension(ctx, vote)
cs.metrics.MarkVoteExtensionReceived(err == nil)
if err != nil {
return false, err
}
}
@@ -2348,6 +2356,11 @@ func (cs *State) addVote(
// Either duplicate, or error upon cs.Votes.AddByIndex()
return
}
if vote.Round == cs.Round {
vals := cs.state.Validators
_, val := vals.GetByIndex(vote.ValidatorIndex)
cs.metrics.MarkVoteReceived(vote.Type, val.VotingPower, vals.TotalVotingPower())
}
if err := cs.eventBus.PublishEventVote(types.EventDataVote{Vote: vote}); err != nil {
return added, err

View File

@@ -247,6 +247,10 @@ func (blockExec *BlockExecutor) ApplyBlock(
}
if len(validatorUpdates) > 0 {
blockExec.logger.Debug("updates to validators", "updates", types.ValidatorListString(validatorUpdates))
blockExec.metrics.ValidatorSetUpdates.Add(1)
}
if finalizeBlockResponse.ConsensusParamUpdates != nil {
blockExec.metrics.ConsensusParamUpdates.Add(1)
}
// Update the state with the block and responses.

View File

@@ -17,6 +17,14 @@ const (
type Metrics struct {
// Time between BeginBlock and EndBlock.
BlockProcessingTime metrics.Histogram
// ConsensusParamUpdates is the total number of times the application has
// updated the consensus params since process start.
ConsensusParamUpdates metrics.Counter
// ValidatorSetUpdates is the total number of times the application has
// updated the validator set since process start.
ValidatorSetUpdates metrics.Counter
}
// PrometheusMetrics returns Metrics build using Prometheus client library.
@@ -35,12 +43,29 @@ func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics {
Help: "Time between BeginBlock and EndBlock in ms.",
Buckets: stdprometheus.LinearBuckets(1, 10, 10),
}, labels).With(labelsAndValues...),
ConsensusParamUpdates: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "consensus_param_updates",
Help: "The total number of times the application as updated the consensus " +
"parameters since process start.",
}, labels).With(labelsAndValues...),
ValidatorSetUpdates: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "validator_set_updates",
Help: "The total number of times the application as updated the validator " +
"set since process start.",
}, labels).With(labelsAndValues...),
}
}
// NopMetrics returns no-op Metrics.
func NopMetrics() *Metrics {
return &Metrics{
BlockProcessingTime: discard.NewHistogram(),
BlockProcessingTime: discard.NewHistogram(),
ConsensusParamUpdates: discard.NewCounter(),
ValidatorSetUpdates: discard.NewCounter(),
}
}