Files
tendermint/internal/consensus/metrics.go
William Banfield 36a1acff52 internal/proxy: add initial set of abci metrics (#7115)
This PR adds an initial set of metrics for use ABCI. The initial metrics enable the calculation of timing histograms and call counts for each of the ABCI methods. The metrics are also labeled as either 'sync' or 'async' to determine if the method call was performed using ABCI's `*Async` methods.

An example of these metrics is included here for reference:
```
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.0001"} 0
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.0004"} 5
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.002"} 12
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.009"} 13
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.02"} 13
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.1"} 13
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.65"} 13
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="2"} 13
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="6"} 13
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="25"} 13
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="+Inf"} 13
tendermint_abci_connection_method_timing_sum{chain_id="ci",method="commit",type="sync"} 0.007802058000000001
tendermint_abci_connection_method_timing_count{chain_id="ci",method="commit",type="sync"} 13
```

These metrics can easily be graphed using prometheus's `histogram_quantile(...)` method to pick out a particular quantile to graph or examine. I chose buckets that were somewhat of an estimate of expected range of times for ABCI operations. They start at .0001 seconds and range to 25 seconds. The hope is that this range captures enough possible times to be useful for us and operators.
2021-10-13 20:52:25 +00:00

239 lines
8.4 KiB
Go

package consensus
import (
"github.com/go-kit/kit/metrics"
"github.com/go-kit/kit/metrics/discard"
"github.com/tendermint/tendermint/types"
prometheus "github.com/go-kit/kit/metrics/prometheus"
stdprometheus "github.com/prometheus/client_golang/prometheus"
)
const (
// MetricsSubsystem is a subsystem shared by all metrics exposed by this
// package.
MetricsSubsystem = "consensus"
)
// Metrics contains metrics exposed by this package.
type Metrics struct {
// Height of the chain.
Height metrics.Gauge
// ValidatorLastSignedHeight of a validator.
ValidatorLastSignedHeight metrics.Gauge
// Number of rounds.
Rounds metrics.Gauge
// Number of validators.
Validators metrics.Gauge
// Total power of all validators.
ValidatorsPower metrics.Gauge
// Power of a validator.
ValidatorPower metrics.Gauge
// Amount of blocks missed by a validator.
ValidatorMissedBlocks metrics.Gauge
// Number of validators who did not sign.
MissingValidators metrics.Gauge
// Total power of the missing validators.
MissingValidatorsPower metrics.Gauge
// Number of validators who tried to double sign.
ByzantineValidators metrics.Gauge
// Total power of the byzantine validators.
ByzantineValidatorsPower metrics.Gauge
// Time between this and the last block.
BlockIntervalSeconds metrics.Histogram
// Number of transactions.
NumTxs metrics.Gauge
// Size of the block.
BlockSizeBytes metrics.Histogram
// Total number of transactions.
TotalTxs metrics.Gauge
// The latest block height.
CommittedHeight metrics.Gauge
// Whether or not a node is block syncing. 1 if yes, 0 if no.
BlockSyncing metrics.Gauge
// Whether or not a node is state syncing. 1 if yes, 0 if no.
StateSyncing metrics.Gauge
// Number of blockparts transmitted by peer.
BlockParts metrics.Counter
// Histogram of time taken per step annotated with reason that the step proceeded.
StepTime metrics.Histogram
}
// PrometheusMetrics returns Metrics build using Prometheus client library.
// Optionally, labels can be provided along with their values ("foo",
// "fooValue").
func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics {
labels := []string{}
for i := 0; i < len(labelsAndValues); i += 2 {
labels = append(labels, labelsAndValues[i])
}
return &Metrics{
Height: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "height",
Help: "Height of the chain.",
}, labels).With(labelsAndValues...),
Rounds: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "rounds",
Help: "Number of rounds.",
}, labels).With(labelsAndValues...),
Validators: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "validators",
Help: "Number of validators.",
}, labels).With(labelsAndValues...),
ValidatorLastSignedHeight: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "validator_last_signed_height",
Help: "Last signed height for a validator",
}, append(labels, "validator_address")).With(labelsAndValues...),
ValidatorMissedBlocks: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "validator_missed_blocks",
Help: "Total missed blocks for a validator",
}, append(labels, "validator_address")).With(labelsAndValues...),
ValidatorsPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "validators_power",
Help: "Total power of all validators.",
}, labels).With(labelsAndValues...),
ValidatorPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "validator_power",
Help: "Power of a validator",
}, append(labels, "validator_address")).With(labelsAndValues...),
MissingValidators: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "missing_validators",
Help: "Number of validators who did not sign.",
}, labels).With(labelsAndValues...),
MissingValidatorsPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "missing_validators_power",
Help: "Total power of the missing validators.",
}, labels).With(labelsAndValues...),
ByzantineValidators: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "byzantine_validators",
Help: "Number of validators who tried to double sign.",
}, labels).With(labelsAndValues...),
ByzantineValidatorsPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "byzantine_validators_power",
Help: "Total power of the byzantine validators.",
}, labels).With(labelsAndValues...),
BlockIntervalSeconds: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "block_interval_seconds",
Help: "Time between this and the last block.",
}, labels).With(labelsAndValues...),
NumTxs: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "num_txs",
Help: "Number of transactions.",
}, labels).With(labelsAndValues...),
BlockSizeBytes: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "block_size_bytes",
Help: "Size of the block.",
}, labels).With(labelsAndValues...),
TotalTxs: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "total_txs",
Help: "Total number of transactions.",
}, labels).With(labelsAndValues...),
CommittedHeight: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "latest_block_height",
Help: "The latest block height.",
}, labels).With(labelsAndValues...),
BlockSyncing: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "block_syncing",
Help: "Whether or not a node is block syncing. 1 if yes, 0 if no.",
}, labels).With(labelsAndValues...),
StateSyncing: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "state_syncing",
Help: "Whether or not a node is state syncing. 1 if yes, 0 if no.",
}, labels).With(labelsAndValues...),
BlockParts: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "block_parts",
Help: "Number of blockparts transmitted by peer.",
}, append(labels, "peer_id")).With(labelsAndValues...),
StepTime: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{
Namespace: namespace,
Subsystem: MetricsSubsystem,
Name: "step_time",
Help: "Time spent per step.",
}, append(labels, "step", "reason")).With(labelsAndValues...),
}
}
// NopMetrics returns no-op Metrics.
func NopMetrics() *Metrics {
return &Metrics{
Height: discard.NewGauge(),
ValidatorLastSignedHeight: discard.NewGauge(),
Rounds: discard.NewGauge(),
Validators: discard.NewGauge(),
ValidatorsPower: discard.NewGauge(),
ValidatorPower: discard.NewGauge(),
ValidatorMissedBlocks: discard.NewGauge(),
MissingValidators: discard.NewGauge(),
MissingValidatorsPower: discard.NewGauge(),
ByzantineValidators: discard.NewGauge(),
ByzantineValidatorsPower: discard.NewGauge(),
BlockIntervalSeconds: discard.NewHistogram(),
NumTxs: discard.NewGauge(),
BlockSizeBytes: discard.NewHistogram(),
TotalTxs: discard.NewGauge(),
CommittedHeight: discard.NewGauge(),
BlockSyncing: discard.NewGauge(),
StateSyncing: discard.NewGauge(),
BlockParts: discard.NewCounter(),
}
}
// RecordConsMetrics uses for recording the block related metrics during fast-sync.
func (m *Metrics) RecordConsMetrics(block *types.Block) {
m.NumTxs.Set(float64(len(block.Data.Txs)))
m.TotalTxs.Add(float64(len(block.Data.Txs)))
m.BlockSizeBytes.Observe(float64(block.Size()))
m.CommittedHeight.Set(float64(block.Height))
}