diff --git a/consensus/metrics.go b/consensus/metrics.go index 527d53744..48d1a603b 100644 --- a/consensus/metrics.go +++ b/consensus/metrics.go @@ -1,8 +1,13 @@ package consensus import ( + "strings" + "time" + "github.com/go-kit/kit/metrics" "github.com/go-kit/kit/metrics/discard" + cstypes "github.com/tendermint/tendermint/consensus/types" + "github.com/tendermint/tendermint/types" prometheus "github.com/go-kit/kit/metrics/prometheus" stdprometheus "github.com/prometheus/client_golang/prometheus" @@ -25,6 +30,9 @@ type Metrics struct { // Number of rounds. Rounds metrics.Gauge + // Histogram of round duration. + RoundDuration metrics.Histogram + // Number of validators. Validators metrics.Gauge // Total power of all validators. @@ -61,6 +69,14 @@ type Metrics struct { // Number of blockparts transmitted by peer. BlockParts metrics.Counter + // Histogram of step duration. + StepDuration metrics.Histogram + stepStart time.Time + + // Number of block parts received by the node, separated by whether the part + // was relevant to the block the node is trying to gather or not. + BlockGossipPartsReceived metrics.Counter + // QuroumPrevoteMessageDelay is the interval in seconds between the proposal // timestamp and the timestamp of the earliest prevote that achieved a quorum // during the prevote step. @@ -99,7 +115,13 @@ func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { Name: "rounds", Help: "Number of rounds.", }, labels).With(labelsAndValues...), - + RoundDuration: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "round_duration_seconds", + Help: "Time spent in a round.", + Buckets: stdprometheus.ExponentialBucketsRange(0.1, 100, 8), + }, labels).With(labelsAndValues...), Validators: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ Namespace: namespace, Subsystem: MetricsSubsystem, @@ -202,6 +224,20 @@ func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { Name: "block_parts", Help: "Number of blockparts transmitted by peer.", }, append(labels, "peer_id")).With(labelsAndValues...), + BlockGossipPartsReceived: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "block_gossip_parts_received", + Help: "Number of block parts received by the node, labeled by whether the " + + "part was relevant to the block the node was currently gathering or not.", + }, append(labels, "matches_current")).With(labelsAndValues...), + StepDuration: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "step_duration_seconds", + Help: "Time spent per step.", + Buckets: stdprometheus.ExponentialBucketsRange(0.1, 100, 8), + }, append(labels, "step")).With(labelsAndValues...), QuorumPrevoteMessageDelay: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ Namespace: namespace, Subsystem: MetricsSubsystem, @@ -226,7 +262,9 @@ func NopMetrics() *Metrics { ValidatorLastSignedHeight: discard.NewGauge(), - Rounds: discard.NewGauge(), + Rounds: discard.NewGauge(), + RoundDuration: discard.NewHistogram(), + StepDuration: discard.NewHistogram(), Validators: discard.NewGauge(), ValidatorsPower: discard.NewGauge(), @@ -246,7 +284,31 @@ func NopMetrics() *Metrics { FastSyncing: discard.NewGauge(), StateSyncing: discard.NewGauge(), BlockParts: discard.NewCounter(), + BlockGossipPartsReceived: discard.NewCounter(), QuorumPrevoteMessageDelay: discard.NewGauge(), FullPrevoteMessageDelay: discard.NewGauge(), } } + +// RecordConsMetrics uses for recording the block related metrics during fast-sync. +func (m *Metrics) RecordConsMetrics(block *types.Block) { + m.NumTxs.Set(float64(len(block.Data.Txs))) + m.TotalTxs.Add(float64(len(block.Data.Txs))) + m.BlockSizeBytes.Set(float64(block.Size())) + m.CommittedHeight.Set(float64(block.Height)) +} + +func (m *Metrics) MarkRound(r int32, st time.Time) { + m.Rounds.Set(float64(r)) + roundTime := time.Since(st).Seconds() + m.RoundDuration.Observe(roundTime) +} + +func (m *Metrics) MarkStep(s cstypes.RoundStepType) { + if !m.stepStart.IsZero() { + stepTime := time.Since(m.stepStart).Seconds() + stepName := strings.TrimPrefix(s.String(), "RoundStep") + m.StepDuration.With("step", stepName).Observe(stepTime) + } + m.stepStart = time.Now() +} diff --git a/consensus/state.go b/consensus/state.go index 2eee62166..9823192bf 100644 --- a/consensus/state.go +++ b/consensus/state.go @@ -4,7 +4,7 @@ import ( "bytes" "errors" "fmt" - "io/ioutil" + "io" "os" "runtime/debug" "sort" @@ -523,6 +523,14 @@ func (cs *State) updateHeight(height int64) { } func (cs *State) updateRoundStep(round int32, step cstypes.RoundStepType) { + if !cs.replayMode { + if round != cs.Round || round == 0 && step == cstypes.RoundStepNewRound { + cs.metrics.MarkRound(cs.Round, cs.StartTime) + } + if cs.Step != step { + cs.metrics.MarkStep(cs.Step) + } + } cs.Round = round cs.Step = step } @@ -1021,9 +1029,6 @@ func (cs *State) enterNewRound(height int64, round int32) { if err := cs.eventBus.PublishEventNewRound(cs.NewRoundEvent()); err != nil { cs.Logger.Error("failed publishing new round", "err", err) } - - cs.metrics.Rounds.Set(float64(round)) - // Wait for txs to be available in the mempool // before we enterPropose in round 0. If the last block changed the app hash, // we may need an empty "proof" block, and enterPropose immediately. @@ -1854,11 +1859,13 @@ func (cs *State) addProposalBlockPart(msg *BlockPartMessage, peerID p2p.ID) (add // Blocks might be reused, so round mismatch is OK if cs.Height != height { cs.Logger.Debug("received block part from wrong height", "height", height, "round", round) + cs.metrics.BlockGossipPartsReceived.With("matches_current", "false").Add(1) return false, nil } // We're not expecting a block part. if cs.ProposalBlockParts == nil { + cs.metrics.BlockGossipPartsReceived.With("matches_current", "false").Add(1) // NOTE: this can happen when we've gone to a higher round and // then receive parts from the previous round - not necessarily a bad peer. cs.Logger.Debug( @@ -1873,15 +1880,21 @@ func (cs *State) addProposalBlockPart(msg *BlockPartMessage, peerID p2p.ID) (add added, err = cs.ProposalBlockParts.AddPart(part) if err != nil { + if errors.Is(err, types.ErrPartSetInvalidProof) || errors.Is(err, types.ErrPartSetUnexpectedIndex) { + cs.metrics.BlockGossipPartsReceived.With("matches_current", "false").Add(1) + } return added, err } + + cs.metrics.BlockGossipPartsReceived.With("matches_current", "true").Add(1) + if cs.ProposalBlockParts.ByteSize() > cs.state.ConsensusParams.Block.MaxBytes { return added, fmt.Errorf("total size of proposal block parts exceeds maximum block bytes (%d > %d)", cs.ProposalBlockParts.ByteSize(), cs.state.ConsensusParams.Block.MaxBytes, ) } if added && cs.ProposalBlockParts.IsComplete() { - bz, err := ioutil.ReadAll(cs.ProposalBlockParts.GetReader()) + bz, err := io.ReadAll(cs.ProposalBlockParts.GetReader()) if err != nil { return added, err } diff --git a/docs/tendermint-core/metrics.md b/docs/tendermint-core/metrics.md index 562c0cb95..b0f7033ac 100644 --- a/docs/tendermint-core/metrics.md +++ b/docs/tendermint-core/metrics.md @@ -18,39 +18,42 @@ Listen address can be changed in the config file (see The following metrics are available: -| **Name** | **Type** | **Tags** | **Description** | -|----------------------------------------|-----------|---------------|------------------------------------------------------------------------| -| abci_connection_method_timing_seconds | Histogram | method, type | Timings for each of the ABCI methods | -| consensus_height | Gauge | | Height of the chain | -| consensus_validators | Gauge | | Number of validators | -| consensus_validators_power | Gauge | | Total voting power of all validators | -| consensus_validator_power | Gauge | | Voting power of the node if in the validator set | -| consensus_validator_last_signed_height | Gauge | | Last height the node signed a block, if the node is a validator | -| consensus_validator_missed_blocks | Gauge | | Total amount of blocks missed for the node, if the node is a validator | -| consensus_missing_validators | Gauge | | Number of validators who did not sign | -| consensus_missing_validators_power | Gauge | | Total voting power of the missing validators | -| consensus_byzantine_validators | Gauge | | Number of validators who tried to double sign | -| consensus_byzantine_validators_power | Gauge | | Total voting power of the byzantine validators | -| consensus_block_interval_seconds | Histogram | | Time between this and last block (Block.Header.Time) in seconds | -| consensus_rounds | Gauge | | Number of rounds | -| consensus_num_txs | Gauge | | Number of transactions | -| consensus_total_txs | Gauge | | Total number of transactions committed | -| consensus_block_parts | counter | peer_id | number of blockparts transmitted by peer | -| consensus_latest_block_height | gauge | | /status sync_info number | -| consensus_fast_syncing | gauge | | either 0 (not fast syncing) or 1 (syncing) | -| consensus_state_syncing | gauge | | either 0 (not state syncing) or 1 (syncing) | -| consensus_block_size_bytes | Gauge | | Block size in bytes | -| p2p_peers | Gauge | | Number of peers node's connected to | -| p2p_peer_receive_bytes_total | counter | peer_id, chID | number of bytes per channel received from a given peer | -| p2p_peer_send_bytes_total | counter | peer_id, chID | number of bytes per channel sent to a given peer | -| p2p_peer_pending_send_bytes | gauge | peer_id | number of pending bytes to be sent to a given peer | -| p2p_num_txs | gauge | peer_id | number of transactions submitted by each peer_id | -| p2p_pending_send_bytes | gauge | peer_id | amount of data pending to be sent to peer | -| mempool_size | Gauge | | Number of uncommitted transactions | -| mempool_tx_size_bytes | histogram | | transaction sizes in bytes | -| mempool_failed_txs | counter | | number of failed transactions | -| mempool_recheck_times | counter | | number of transactions rechecked in the mempool | -| state_block_processing_time | histogram | | time between BeginBlock and EndBlock in ms | +| **Name** | **Type** | **Tags** | **Description** | +|----------------------------------------|-----------|-----------------|--------------------------------------------------------------------------------------------------------------------------------------------| +| abci_connection_method_timing_seconds | Histogram | method, type | Timings for each of the ABCI methods | +| consensus_height | Gauge | | Height of the chain | +| consensus_validators | Gauge | | Number of validators | +| consensus_validators_power | Gauge | | Total voting power of all validators | +| consensus_validator_power | Gauge | | Voting power of the node if in the validator set | +| consensus_validator_last_signed_height | Gauge | | Last height the node signed a block, if the node is a validator | +| consensus_validator_missed_blocks | Gauge | | Total amount of blocks missed for the node, if the node is a validator | +| consensus_missing_validators | Gauge | | Number of validators who did not sign | +| consensus_missing_validators_power | Gauge | | Total voting power of the missing validators | +| consensus_byzantine_validators | Gauge | | Number of validators who tried to double sign | +| consensus_byzantine_validators_power | Gauge | | Total voting power of the byzantine validators | +| consensus_block_interval_seconds | Histogram | | Time between this and last block (Block.Header.Time) in seconds | +| consensus_rounds | Gauge | | Number of rounds | +| consensus_num_txs | Gauge | | Number of transactions | +| consensus_total_txs | Gauge | | Total number of transactions committed | +| consensus_block_parts | counter | peer_id | number of blockparts transmitted by peer | +| consensus_latest_block_height | gauge | | /status sync_info number | +| consensus_fast_syncing | gauge | | either 0 (not fast syncing) or 1 (syncing) | +| consensus_state_syncing | gauge | | either 0 (not state syncing) or 1 (syncing) | +| consensus_block_size_bytes | Gauge | | Block size in bytes | +| consensus_step_duration | Histogram | step | Histogram of durations for each step in the consensus protocol | +| consensus_round_duration | Histogram | | Histogram of durations for all the rounds that have occurred since the process started | +| consensus_block_gossip_parts_received | Counter | matches_current | Number of block parts received by the node | +| p2p_peers | Gauge | | Number of peers node's connected to | +| p2p_peer_receive_bytes_total | counter | peer_id, chID | number of bytes per channel received from a given peer | +| p2p_peer_send_bytes_total | counter | peer_id, chID | number of bytes per channel sent to a given peer | +| p2p_peer_pending_send_bytes | gauge | peer_id | number of pending bytes to be sent to a given peer | +| p2p_num_txs | gauge | peer_id | number of transactions submitted by each peer_id | +| p2p_pending_send_bytes | gauge | peer_id | amount of data pending to be sent to peer | +| mempool_size | Gauge | | Number of uncommitted transactions | +| mempool_tx_size_bytes | histogram | | transaction sizes in bytes | +| mempool_failed_txs | counter | | number of failed transactions | +| mempool_recheck_times | counter | | number of transactions rechecked in the mempool | +| state_block_processing_time | histogram | | time between BeginBlock and EndBlock in ms | ## Useful queries