diff --git a/docs/nodes/metrics.md b/docs/nodes/metrics.md index 1b2e9f007..7b0622519 100644 --- a/docs/nodes/metrics.md +++ b/docs/nodes/metrics.md @@ -18,40 +18,53 @@ Listen address can be changed in the config file (see The following metrics are available: -| **Name** | **Type** | **Tags** | **Description** | -| -------------------------------------- | --------- | ------------- | ---------------------------------------------------------------------- | -| abci_connection_method_timing | Histogram | method, type | Timings for each of the ABCI methods | -| consensus_height | Gauge | | Height of the chain | -| consensus_validators | Gauge | | Number of validators | -| consensus_validators_power | Gauge | | Total voting power of all validators | -| consensus_validator_power | Gauge | | Voting power of the node if in the validator set | -| consensus_validator_last_signed_height | Gauge | | Last height the node signed a block, if the node is a validator | -| consensus_validator_missed_blocks | Gauge | | Total amount of blocks missed for the node, if the node is a validator | -| consensus_missing_validators | Gauge | | Number of validators who did not sign | -| consensus_missing_validators_power | Gauge | | Total voting power of the missing validators | -| consensus_byzantine_validators | Gauge | | Number of validators who tried to double sign | -| consensus_byzantine_validators_power | Gauge | | Total voting power of the byzantine validators | -| consensus_block_interval_seconds | Histogram | | Time between this and last block (Block.Header.Time) in seconds | -| consensus_rounds | Gauge | | Number of rounds | -| consensus_num_txs | Gauge | | Number of transactions | -| consensus_total_txs | Gauge | | Total number of transactions committed | -| consensus_block_parts | counter | peer_id | number of blockparts transmitted by peer | -| consensus_latest_block_height | gauge | | /status sync_info number | -| consensus_fast_syncing | gauge | | either 0 (not fast syncing) or 1 
(syncing) | -| consensus_state_syncing | gauge | | either 0 (not state syncing) or 1 (syncing) | -| consensus_block_size_bytes | Gauge | | Block size in bytes | -| evidence_pool_num_evidence | Gauge | | Number of evidence in the evidence pool -| p2p_peers | Gauge | | Number of peers node's connected to | -| p2p_peer_receive_bytes_total | counter | peer_id, chID | number of bytes per channel received from a given peer | -| p2p_peer_send_bytes_total | counter | peer_id, chID | number of bytes per channel sent to a given peer | -| p2p_peer_pending_send_bytes | gauge | peer_id | number of pending bytes to be sent to a given peer | -| p2p_num_txs | gauge | peer_id | number of transactions submitted by each peer_id | -| p2p_pending_send_bytes | gauge | peer_id | amount of data pending to be sent to peer | -| mempool_size | Gauge | | Number of uncommitted transactions | -| mempool_tx_size_bytes | histogram | | transaction sizes in bytes | -| mempool_failed_txs | counter | | number of failed transactions | -| mempool_recheck_times | counter | | number of transactions rechecked in the mempool | -| state_block_processing_time | histogram | | time between BeginBlock and EndBlock in ms | +| **Name** | **Type** | **Tags** | **Description** | +|-----------------------------------------|-----------|-----------------|--------------------------------------------------------------------------------------------------------------------------------------------| +| abci_connection_method_timing | Histogram | method, type | Timings for each of the ABCI methods | +| consensus_height | Gauge | | Height of the chain | +| consensus_validators | Gauge | | Number of validators | +| consensus_validators_power | Gauge | | Total voting power of all validators | +| consensus_validator_power | Gauge | | Voting power of the node if in the validator set | +| consensus_validator_last_signed_height | Gauge | | Last height the node signed a block, if the node is a validator | +| 
consensus_validator_missed_blocks | Gauge | | Total amount of blocks missed for the node, if the node is a validator | +| consensus_missing_validators | Gauge | | Number of validators who did not sign | +| consensus_missing_validators_power | Gauge | | Total voting power of the missing validators | +| consensus_byzantine_validators | Gauge | | Number of validators who tried to double sign | +| consensus_byzantine_validators_power | Gauge | | Total voting power of the byzantine validators | +| consensus_block_interval_seconds | Histogram | | Time between this and last block (Block.Header.Time) in seconds | +| consensus_rounds | Gauge | | Number of rounds | +| consensus_num_txs | Gauge | | Number of transactions | +| consensus_total_txs | Gauge | | Total number of transactions committed | +| consensus_block_parts | Counter | peer_id | number of blockparts transmitted by peer | +| consensus_latest_block_height | gauge | | /status sync_info number | +| consensus_fast_syncing | gauge | | either 0 (not fast syncing) or 1 (syncing) | +| consensus_state_syncing | gauge | | either 0 (not state syncing) or 1 (syncing) | +| consensus_block_size_bytes | Gauge | | Block size in bytes | +| consensus_step_duration | Histogram | step | Histogram of durations for each step in the consensus protocol | +| consensus_block_gossip_receive_latency | Histogram | | Histogram of time taken to receive a block in seconds, measure between when a new block is first discovered to when the block is completed | +| consensus_block_gossip_parts_received | Counter | matches_current | Number of block parts received by the node | +| consensus_quorum_prevote_delay | Gauge | | Interval in seconds between the proposal timestamp and the timestamp of the earliest prevote that achieved a quorum | +| consensus_full_prevote_delay | Gauge | | Interval in seconds between the proposal timestamp and the timestamp of the latest prevote in a round where all validators voted | +| 
consensus_proposal_timestamp_difference | Histogram | is_timely | Difference between the timestamp in the proposal message and the local time of the validator at the time it received the message | +| consensus_vote_extension_receive_count | Counter | status | Number of vote extensions received | +| consensus_proposal_receive_count | Counter | status | Total number of proposals received by the node since process start | +| consensus_proposal_create_count | Counter | | Total number of proposals created by the node since process start | +| consensus_round_voting_power_percent | Gauge | vote_type | A value between 0 and 1.0 representing the percentage of the total voting power per vote type received within a round | +| consensus_late_votes | Counter | vote_type | Number of votes received by the node since process start that correspond to earlier heights and rounds than this node is currently in. | +| evidence_pool_num_evidence | Gauge | | Number of evidence in the evidence pool | +| p2p_peers | Gauge | | Number of peers node's connected to | +| p2p_peer_receive_bytes_total | Counter | peer_id, chID | number of bytes per channel received from a given peer | +| p2p_peer_send_bytes_total | Counter | peer_id, chID | number of bytes per channel sent to a given peer | +| p2p_peer_pending_send_bytes | Gauge | peer_id | number of pending bytes to be sent to a given peer | +| p2p_num_txs | Gauge | peer_id | number of transactions submitted by each peer_id | +| p2p_pending_send_bytes | Gauge | peer_id | amount of data pending to be sent to peer | +| mempool_size | Gauge | | Number of uncommitted transactions | +| mempool_tx_size_bytes | Histogram | | transaction sizes in bytes | +| mempool_failed_txs | Counter | | number of failed transactions | +| mempool_recheck_times | Counter | | number of transactions rechecked in the mempool | +| state_block_processing_time | Histogram | | time between BeginBlock and EndBlock in ms | +| state_consensus_param_updates | Counter | | number of consensus
parameter updates returned by the application since process start | +| state_validator_set_updates | Counter | | number of validator set updates returned by the application since process start | ## Useful queries diff --git a/internal/consensus/metrics.go b/internal/consensus/metrics.go index ed31ec636..e5c0162f4 100644 --- a/internal/consensus/metrics.go +++ b/internal/consensus/metrics.go @@ -8,6 +8,7 @@ import ( "github.com/go-kit/kit/metrics/discard" cstypes "github.com/tendermint/tendermint/internal/consensus/types" + tmproto "github.com/tendermint/tendermint/proto/tendermint/types" "github.com/tendermint/tendermint/types" prometheus "github.com/go-kit/kit/metrics/prometheus" @@ -103,6 +104,33 @@ type Metrics struct { // the proposal message and the local time of the validator at the time // that the validator received the message. ProposalTimestampDifference metrics.Histogram + + // VoteExtensionReceiveCount is the number of vote extensions received by this + // node. The metric is annotated by the status of the vote extension from the + // application, either 'accepted' or 'rejected'. + VoteExtensionReceiveCount metrics.Counter + + // ProposalReceiveCount is the total number of proposals received by this node + // since process start. + // The metric is annotated by the status of the proposal from the application, + // either 'accepted' or 'rejected'. + ProposalReceiveCount metrics.Counter + + // ProposalCreateCount is the total number of proposals created by this node + // since process start. + // Unlike ProposalReceiveCount, this metric is not annotated with a status + // label. + ProposalCreateCount metrics.Counter + + // RoundVotingPowerPercent is the percentage of the total voting power received + // within a round. The value begins at 0 for each round and approaches 1.0 as + // additional voting power is observed. The metric is labeled by vote type.
+ RoundVotingPowerPercent metrics.Gauge + + // LateVotes stores the number of votes that were received by this node that + // correspond to earlier heights and rounds than this node is currently + // in. + LateVotes metrics.Counter } // PrometheusMetrics returns Metrics build using Prometheus client library. @@ -280,6 +308,43 @@ func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { "Only calculated when a new block is proposed.", Buckets: []float64{-10, -.5, -.025, 0, .1, .5, 1, 1.5, 2, 10}, }, append(labels, "is_timely")).With(labelsAndValues...), + VoteExtensionReceiveCount: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "vote_extension_receive_count", + Help: "Number of vote extensions received by the node since process start, labeled by " + + "the application's response to VerifyVoteExtension, either accept or reject.", + }, append(labels, "status")).With(labelsAndValues...), + + ProposalReceiveCount: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "proposal_receive_count", + Help: "Number of vote proposals received by the node since process start, labeled by " + + "the application's response to ProcessProposal, either accept or reject.", + }, append(labels, "status")).With(labelsAndValues...), + + ProposalCreateCount: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "proposal_create_count", + Help: "Number of proposals created by the node since process start.", + }, labels).With(labelsAndValues...), + + RoundVotingPowerPercent: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "round_voting_power_percent", + Help: "Percentage of the total voting power received with a round. 
" + + "The value begins at 0 for each round and approaches 1.0 as additional " + + "voting power is observed.", + }, append(labels, "vote_type")).With(labelsAndValues...), + LateVotes: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "late_votes", + Help: "Number of votes received by the node since process start that correspond to earlier heights and rounds than this node is currently in.", + }, append(labels, "vote_type")).With(labelsAndValues...), } } @@ -317,6 +382,11 @@ func NopMetrics() *Metrics { QuorumPrevoteDelay: discard.NewGauge(), FullPrevoteDelay: discard.NewGauge(), ProposalTimestampDifference: discard.NewHistogram(), + VoteExtensionReceiveCount: discard.NewCounter(), + ProposalReceiveCount: discard.NewCounter(), + ProposalCreateCount: discard.NewCounter(), + RoundVotingPowerPercent: discard.NewGauge(), + LateVotes: discard.NewCounter(), } } @@ -336,10 +406,45 @@ func (m *Metrics) MarkBlockGossipComplete() { m.BlockGossipReceiveLatency.Observe(time.Since(m.blockGossipStart).Seconds()) } +func (m *Metrics) MarkProposalProcessed(accepted bool) { + status := "accepted" + if !accepted { + status = "rejected" + } + m.ProposalReceiveCount.With("status", status).Add(1) +} + +func (m *Metrics) MarkVoteExtensionReceived(accepted bool) { + status := "accepted" + if !accepted { + status = "rejected" + } + m.VoteExtensionReceiveCount.With("status", status).Add(1) +} + +func (m *Metrics) MarkVoteReceived(vt tmproto.SignedMsgType, power, totalPower int64) { + p := float64(power) / float64(totalPower) + n := strings.ToLower(strings.TrimPrefix(vt.String(), "SIGNED_MSG_TYPE_")) + m.RoundVotingPowerPercent.With("vote_type", n).Add(p) +} + func (m *Metrics) MarkRound(r int32, st time.Time) { m.Rounds.Set(float64(r)) roundTime := time.Since(st).Seconds() m.RoundDuration.Observe(roundTime) + + pvt := tmproto.PrevoteType + pvn := strings.ToLower(strings.TrimPrefix(pvt.String(), "SIGNED_MSG_TYPE_")) + 
m.RoundVotingPowerPercent.With("vote_type", pvn).Set(0) + + pct := tmproto.PrecommitType + pcn := strings.ToLower(strings.TrimPrefix(pct.String(), "SIGNED_MSG_TYPE_")) + m.RoundVotingPowerPercent.With("vote_type", pcn).Set(0) +} + +func (m *Metrics) MarkLateVote(vt tmproto.SignedMsgType) { + n := strings.ToLower(strings.TrimPrefix(vt.String(), "SIGNED_MSG_TYPE_")) + m.LateVotes.With("vote_type", n).Add(1) } func (m *Metrics) MarkStep(s cstypes.RoundStepType) { diff --git a/internal/consensus/state.go b/internal/consensus/state.go index 490801ad2..90efbab77 100644 --- a/internal/consensus/state.go +++ b/internal/consensus/state.go @@ -1334,6 +1334,7 @@ func (cs *State) defaultDecideProposal(ctx context.Context, height int64, round } else if block == nil { return } + cs.metrics.ProposalCreateCount.Add(1) blockParts, err = block.MakePartSet(types.BlockPartSizeBytes) if err != nil { cs.logger.Error("unable to create proposal block part set", "error", err) @@ -1531,6 +1532,7 @@ func (cs *State) defaultDoPrevote(ctx context.Context, height int64, round int32 if err != nil { panic(fmt.Sprintf("ProcessProposal: %v", err)) } + cs.metrics.MarkProposalProcessed(isAppValid) // Vote nil if the Application rejected the block if !isAppValid { @@ -2297,6 +2299,10 @@ func (cs *State) addVote( "cs_height", cs.Height, ) + if vote.Height < cs.Height || (vote.Height == cs.Height && vote.Round < cs.Round) { + cs.metrics.MarkLateVote(vote.Type) + } + // A precommit for the previous height? 
// These come in while we wait timeoutCommit if vote.Height+1 == cs.Height && vote.Type == tmproto.PrecommitType { @@ -2337,7 +2343,9 @@ func (cs *State) addVote( // Verify VoteExtension if precommit if vote.Type == tmproto.PrecommitType { - if err = cs.blockExec.VerifyVoteExtension(ctx, vote); err != nil { + err := cs.blockExec.VerifyVoteExtension(ctx, vote) + cs.metrics.MarkVoteExtensionReceived(err == nil) + if err != nil { return false, err } } @@ -2348,6 +2356,11 @@ func (cs *State) addVote( // Either duplicate, or error upon cs.Votes.AddByIndex() return } + if vote.Round == cs.Round { + vals := cs.state.Validators + _, val := vals.GetByIndex(vote.ValidatorIndex) + cs.metrics.MarkVoteReceived(vote.Type, val.VotingPower, vals.TotalVotingPower()) + } if err := cs.eventBus.PublishEventVote(types.EventDataVote{Vote: vote}); err != nil { return added, err diff --git a/internal/state/execution.go b/internal/state/execution.go index 06dfc0b5c..cfacb816d 100644 --- a/internal/state/execution.go +++ b/internal/state/execution.go @@ -247,6 +247,10 @@ func (blockExec *BlockExecutor) ApplyBlock( } if len(validatorUpdates) > 0 { blockExec.logger.Debug("updates to validators", "updates", types.ValidatorListString(validatorUpdates)) + blockExec.metrics.ValidatorSetUpdates.Add(1) + } + if finalizeBlockResponse.ConsensusParamUpdates != nil { + blockExec.metrics.ConsensusParamUpdates.Add(1) } // Update the state with the block and responses. diff --git a/internal/state/metrics.go b/internal/state/metrics.go index bcd713f5f..1d4a13b94 100644 --- a/internal/state/metrics.go +++ b/internal/state/metrics.go @@ -17,6 +17,14 @@ const ( type Metrics struct { // Time between BeginBlock and EndBlock. BlockProcessingTime metrics.Histogram + + // ConsensusParamUpdates is the total number of times the application has + // updated the consensus params since process start.
+ ConsensusParamUpdates metrics.Counter + + // ValidatorSetUpdates is the total number of times the application has + // updated the validator set since process start. + ValidatorSetUpdates metrics.Counter } // PrometheusMetrics returns Metrics build using Prometheus client library. @@ -35,12 +43,29 @@ func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { Help: "Time between BeginBlock and EndBlock in ms.", Buckets: stdprometheus.LinearBuckets(1, 10, 10), }, labels).With(labelsAndValues...), + ConsensusParamUpdates: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "consensus_param_updates", + Help: "The total number of times the application as updated the consensus " + + "parameters since process start.", + }, labels).With(labelsAndValues...), + + ValidatorSetUpdates: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "validator_set_updates", + Help: "The total number of times the application as updated the validator " + + "set since process start.", + }, labels).With(labelsAndValues...), } } // NopMetrics returns no-op Metrics. func NopMetrics() *Metrics { return &Metrics{ - BlockProcessingTime: discard.NewHistogram(), + BlockProcessingTime: discard.NewHistogram(), + ConsensusParamUpdates: discard.NewCounter(), + ValidatorSetUpdates: discard.NewCounter(), } }