Files
tendermint/proxy/multi_app_conn.go
William Banfield b92a19b2ce proxy: add initial set of abci metrics (port of #7115) (#9169)
* internal/proxy: add initial set of abci metrics (#7115)

This PR adds an initial set of metrics for use ABCI. The initial metrics enable the calculation of timing histograms and call counts for each of the ABCI methods. The metrics are also labeled as either 'sync' or 'async' to determine if the method call was performed using ABCI's `*Async` methods.

An example of these metrics is included here for reference:
```
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.0001"} 0
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.0004"} 5
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.002"} 12
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.009"} 13
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.02"} 13
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.1"} 13
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.65"} 13
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="2"} 13
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="6"} 13
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="25"} 13
tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="+Inf"} 13
tendermint_abci_connection_method_timing_sum{chain_id="ci",method="commit",type="sync"} 0.007802058000000001
tendermint_abci_connection_method_timing_count{chain_id="ci",method="commit",type="sync"} 13
```

These metrics can easily be graphed using prometheus's `histogram_quantile(...)` method to pick out a particular quantile to graph or examine. I chose buckets that were somewhat of an estimate of expected range of times for ABCI operations. They start at .0001 seconds and range to 25 seconds. The hope is that this range captures enough possible times to be useful for us and operators.

* fixup

* fixups

* docs: add abci timing metrics to the metrics docs (#7311)

* format table
2022-08-05 13:29:00 -04:00

194 lines
5.0 KiB
Go

package proxy
import (
"fmt"
abcicli "github.com/tendermint/tendermint/abci/client"
tmlog "github.com/tendermint/tendermint/libs/log"
tmos "github.com/tendermint/tendermint/libs/os"
"github.com/tendermint/tendermint/libs/service"
)
const (
connConsensus = "consensus"
connMempool = "mempool"
connQuery = "query"
connSnapshot = "snapshot"
)
// AppConns is the Tendermint's interface to the application that consists of
// multiple connections.
type AppConns interface {
service.Service
// Mempool connection
Mempool() AppConnMempool
// Consensus connection
Consensus() AppConnConsensus
// Query connection
Query() AppConnQuery
// Snapshot connection
Snapshot() AppConnSnapshot
}
// NewAppConns calls NewMultiAppConn.
func NewAppConns(clientCreator ClientCreator, metrics *Metrics) AppConns {
return NewMultiAppConn(clientCreator, metrics)
}
// multiAppConn implements AppConns.
//
// A multiAppConn is made of a few appConns and manages their underlying abci
// clients.
// TODO: on app restart, clients must reboot together
type multiAppConn struct {
service.BaseService
metrics *Metrics
consensusConn AppConnConsensus
mempoolConn AppConnMempool
queryConn AppConnQuery
snapshotConn AppConnSnapshot
consensusConnClient abcicli.Client
mempoolConnClient abcicli.Client
queryConnClient abcicli.Client
snapshotConnClient abcicli.Client
clientCreator ClientCreator
}
// NewMultiAppConn makes all necessary abci connections to the application.
func NewMultiAppConn(clientCreator ClientCreator, metrics *Metrics) AppConns {
multiAppConn := &multiAppConn{
metrics: metrics,
clientCreator: clientCreator,
}
multiAppConn.BaseService = *service.NewBaseService(nil, "multiAppConn", multiAppConn)
return multiAppConn
}
func (app *multiAppConn) Mempool() AppConnMempool {
return app.mempoolConn
}
func (app *multiAppConn) Consensus() AppConnConsensus {
return app.consensusConn
}
func (app *multiAppConn) Query() AppConnQuery {
return app.queryConn
}
func (app *multiAppConn) Snapshot() AppConnSnapshot {
return app.snapshotConn
}
func (app *multiAppConn) OnStart() error {
c, err := app.abciClientFor(connQuery)
if err != nil {
return err
}
app.queryConnClient = c
app.queryConn = NewAppConnQuery(c, app.metrics)
c, err = app.abciClientFor(connSnapshot)
if err != nil {
app.stopAllClients()
return err
}
app.snapshotConnClient = c
app.snapshotConn = NewAppConnSnapshot(c, app.metrics)
c, err = app.abciClientFor(connMempool)
if err != nil {
app.stopAllClients()
return err
}
app.mempoolConnClient = c
app.mempoolConn = NewAppConnMempool(c, app.metrics)
c, err = app.abciClientFor(connConsensus)
if err != nil {
app.stopAllClients()
return err
}
app.consensusConnClient = c
app.consensusConn = NewAppConnConsensus(c, app.metrics)
// Kill Tendermint if the ABCI application crashes.
go app.killTMOnClientError()
return nil
}
func (app *multiAppConn) OnStop() {
app.stopAllClients()
}
func (app *multiAppConn) killTMOnClientError() {
killFn := func(conn string, err error, logger tmlog.Logger) {
logger.Error(
fmt.Sprintf("%s connection terminated. Did the application crash? Please restart tendermint", conn),
"err", err)
killErr := tmos.Kill()
if killErr != nil {
logger.Error("Failed to kill this process - please do so manually", "err", killErr)
}
}
select {
case <-app.consensusConnClient.Quit():
if err := app.consensusConnClient.Error(); err != nil {
killFn(connConsensus, err, app.Logger)
}
case <-app.mempoolConnClient.Quit():
if err := app.mempoolConnClient.Error(); err != nil {
killFn(connMempool, err, app.Logger)
}
case <-app.queryConnClient.Quit():
if err := app.queryConnClient.Error(); err != nil {
killFn(connQuery, err, app.Logger)
}
case <-app.snapshotConnClient.Quit():
if err := app.snapshotConnClient.Error(); err != nil {
killFn(connSnapshot, err, app.Logger)
}
}
}
func (app *multiAppConn) stopAllClients() {
if app.consensusConnClient != nil {
if err := app.consensusConnClient.Stop(); err != nil {
app.Logger.Error("error while stopping consensus client", "error", err)
}
}
if app.mempoolConnClient != nil {
if err := app.mempoolConnClient.Stop(); err != nil {
app.Logger.Error("error while stopping mempool client", "error", err)
}
}
if app.queryConnClient != nil {
if err := app.queryConnClient.Stop(); err != nil {
app.Logger.Error("error while stopping query client", "error", err)
}
}
if app.snapshotConnClient != nil {
if err := app.snapshotConnClient.Stop(); err != nil {
app.Logger.Error("error while stopping snapshot client", "error", err)
}
}
}
func (app *multiAppConn) abciClientFor(conn string) (abcicli.Client, error) {
c, err := app.clientCreator.NewABCIClient()
if err != nil {
return nil, fmt.Errorf("error creating ABCI client (%s connection): %w", conn, err)
}
c.SetLogger(app.Logger.With("module", "abci-client", "connection", conn))
if err := c.Start(); err != nil {
return nil, fmt.Errorf("error starting ABCI client (%s connection): %w", conn, err)
}
return c, nil
}