Files
tendermint/node/node.go
M. J. Fromberger c8e8a62084 abci/client: simplify client interface (#7607)
This change has two main effects:

1. Remove most of the Async methods from the abci.Client interface.
   Remaining are FlushAsync, CommitTxAsync, and DeliverTxAsync.

2. Rename the synchronous methods to remove the "Sync" suffix.

The rest of the change is updating the implementations, subsets, and mocks of
the interface, along with the call sites that point to them.

* Fix stringly-typed mock stubs.
* Rename helper method.
2022-01-19 10:58:56 -08:00

788 lines
23 KiB
Go

package node
import (
"context"
"errors"
"fmt"
"net"
"net/http"
"strconv"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
abciclient "github.com/tendermint/tendermint/abci/client"
abci "github.com/tendermint/tendermint/abci/types"
"github.com/tendermint/tendermint/config"
"github.com/tendermint/tendermint/crypto"
"github.com/tendermint/tendermint/internal/blocksync"
"github.com/tendermint/tendermint/internal/consensus"
"github.com/tendermint/tendermint/internal/eventbus"
"github.com/tendermint/tendermint/internal/mempool"
"github.com/tendermint/tendermint/internal/p2p"
"github.com/tendermint/tendermint/internal/p2p/pex"
"github.com/tendermint/tendermint/internal/proxy"
rpccore "github.com/tendermint/tendermint/internal/rpc/core"
sm "github.com/tendermint/tendermint/internal/state"
"github.com/tendermint/tendermint/internal/state/indexer"
"github.com/tendermint/tendermint/internal/statesync"
"github.com/tendermint/tendermint/internal/store"
"github.com/tendermint/tendermint/libs/log"
"github.com/tendermint/tendermint/libs/service"
"github.com/tendermint/tendermint/libs/strings"
tmtime "github.com/tendermint/tendermint/libs/time"
"github.com/tendermint/tendermint/privval"
"github.com/tendermint/tendermint/types"
_ "net/http/pprof" // nolint: gosec // securely exposed on separate, optional port
_ "github.com/lib/pq" // provide the psql db driver
)
// nodeImpl is the highest level interface to a full Tendermint node.
// It includes all configuration information and running services.
type nodeImpl struct {
service.BaseService
logger log.Logger
// config
config *config.Config
genesisDoc *types.GenesisDoc // initial validator set
privValidator types.PrivValidator // local node's validator key
// network
peerManager *p2p.PeerManager
router *p2p.Router
nodeInfo types.NodeInfo
nodeKey types.NodeKey // our node privkey
isListening bool
// services
eventSinks []indexer.EventSink
stateStore sm.Store
blockStore *store.BlockStore // store the blockchain to disk
stateSync bool // whether the node should state sync on startup
stateSyncReactor *statesync.Reactor // for hosting and restoring state sync snapshots
services []service.Service
rpcListeners []net.Listener // rpc servers
shutdownOps closer
rpcEnv *rpccore.Environment
prometheusSrv *http.Server
}
// newDefaultNode returns a Tendermint node with default settings for the
// PrivValidator, ClientCreator, GenesisDoc, and DBProvider.
// It implements NodeProvider.
func newDefaultNode(
ctx context.Context,
cfg *config.Config,
logger log.Logger,
) (service.Service, error) {
nodeKey, err := types.LoadOrGenNodeKey(cfg.NodeKeyFile())
if err != nil {
return nil, fmt.Errorf("failed to load or gen node key %s: %w", cfg.NodeKeyFile(), err)
}
if cfg.Mode == config.ModeSeed {
return makeSeedNode(
ctx,
cfg,
config.DefaultDBProvider,
nodeKey,
defaultGenesisDocProviderFunc(cfg),
logger,
)
}
pval, err := makeDefaultPrivval(cfg)
if err != nil {
return nil, err
}
appClient, _ := proxy.DefaultClientCreator(logger, cfg.ProxyApp, cfg.ABCI, cfg.DBDir())
return makeNode(
ctx,
cfg,
pval,
nodeKey,
appClient,
defaultGenesisDocProviderFunc(cfg),
config.DefaultDBProvider,
logger,
)
}
// makeNode returns a new, ready to go, Tendermint Node.
func makeNode(
ctx context.Context,
cfg *config.Config,
filePrivval *privval.FilePV,
nodeKey types.NodeKey,
clientCreator abciclient.Creator,
genesisDocProvider genesisDocProvider,
dbProvider config.DBProvider,
logger log.Logger,
) (service.Service, error) {
var cancel context.CancelFunc
ctx, cancel = context.WithCancel(ctx)
closers := []closer{convertCancelCloser(cancel)}
blockStore, stateDB, dbCloser, err := initDBs(cfg, dbProvider)
if err != nil {
return nil, combineCloseError(err, dbCloser)
}
closers = append(closers, dbCloser)
stateStore := sm.NewStore(stateDB)
genDoc, err := genesisDocProvider()
if err != nil {
return nil, combineCloseError(err, makeCloser(closers))
}
err = genDoc.ValidateAndComplete()
if err != nil {
return nil, combineCloseError(
fmt.Errorf("error in genesis doc: %w", err),
makeCloser(closers))
}
state, err := loadStateFromDBOrGenesisDocProvider(stateStore, genDoc)
if err != nil {
return nil, combineCloseError(err, makeCloser(closers))
}
nodeMetrics := defaultMetricsProvider(cfg.Instrumentation)(genDoc.ChainID)
// Create the proxyApp and establish connections to the ABCI app (consensus, mempool, query).
proxyApp := proxy.NewAppConns(clientCreator, logger.With("module", "proxy"), nodeMetrics.proxy)
if err := proxyApp.Start(ctx); err != nil {
return nil, fmt.Errorf("error starting proxy app connections: %w", err)
}
// EventBus and IndexerService must be started before the handshake because
// we might need to index the txs of the replayed block as this might not have happened
// when the node stopped last time (i.e. the node stopped after it saved the block
// but before it indexed the txs, or, endblocker panicked)
eventBus := eventbus.NewDefault(logger.With("module", "events"))
if err := eventBus.Start(ctx); err != nil {
return nil, combineCloseError(err, makeCloser(closers))
}
indexerService, eventSinks, err := createAndStartIndexerService(
ctx, cfg, dbProvider, eventBus,
logger, genDoc.ChainID, nodeMetrics.indexer)
if err != nil {
return nil, combineCloseError(err, makeCloser(closers))
}
privValidator, err := createPrivval(ctx, logger, cfg, genDoc, filePrivval)
if err != nil {
return nil, combineCloseError(err, makeCloser(closers))
}
var pubKey crypto.PubKey
if cfg.Mode == config.ModeValidator {
pubKey, err = privValidator.GetPubKey(ctx)
if err != nil {
return nil, combineCloseError(fmt.Errorf("can't get pubkey: %w", err),
makeCloser(closers))
}
if pubKey == nil {
return nil, combineCloseError(
errors.New("could not retrieve public key from private validator"),
makeCloser(closers))
}
}
// Determine whether we should attempt state sync.
stateSync := cfg.StateSync.Enable && !onlyValidatorIsUs(state, pubKey)
if stateSync && state.LastBlockHeight > 0 {
logger.Info("Found local state with non-zero height, skipping state sync")
stateSync = false
}
// Create the handshaker, which calls RequestInfo, sets the AppVersion on the state,
// and replays any blocks as necessary to sync tendermint with the app.
if !stateSync {
if err := consensus.NewHandshaker(
logger.With("module", "handshaker"),
stateStore, state, blockStore, eventBus, genDoc,
).Handshake(ctx, proxyApp); err != nil {
return nil, combineCloseError(err, makeCloser(closers))
}
// Reload the state. It will have the Version.Consensus.App set by the
// Handshake, and may have other modifications as well (ie. depending on
// what happened during block replay).
state, err = stateStore.Load()
if err != nil {
return nil, combineCloseError(
fmt.Errorf("cannot load state: %w", err),
makeCloser(closers))
}
}
// Determine whether we should do block sync. This must happen after the handshake, since the
// app may modify the validator set, specifying ourself as the only validator.
blockSync := !onlyValidatorIsUs(state, pubKey)
logNodeStartupInfo(state, pubKey, logger, cfg.Mode)
// TODO: Fetch and provide real options and do proper p2p bootstrapping.
// TODO: Use a persistent peer database.
nodeInfo, err := makeNodeInfo(cfg, nodeKey, eventSinks, genDoc, state)
if err != nil {
return nil, combineCloseError(err, makeCloser(closers))
}
peerManager, peerCloser, err := createPeerManager(cfg, dbProvider, nodeKey.ID)
closers = append(closers, peerCloser)
if err != nil {
return nil, combineCloseError(
fmt.Errorf("failed to create peer manager: %w", err),
makeCloser(closers))
}
router, err := createRouter(ctx, logger, nodeMetrics.p2p, nodeInfo, nodeKey,
peerManager, cfg, proxyApp)
if err != nil {
return nil, combineCloseError(
fmt.Errorf("failed to create router: %w", err),
makeCloser(closers))
}
mpReactor, mp, err := createMempoolReactor(ctx,
cfg, proxyApp, state, nodeMetrics.mempool, peerManager, router, logger,
)
if err != nil {
return nil, combineCloseError(err, makeCloser(closers))
}
evReactor, evPool, err := createEvidenceReactor(ctx,
cfg, dbProvider, stateDB, blockStore, peerManager, router, logger,
)
if err != nil {
return nil, combineCloseError(err, makeCloser(closers))
}
// make block executor for consensus and blockchain reactors to execute blocks
blockExec := sm.NewBlockExecutor(
stateStore,
logger.With("module", "state"),
proxyApp.Consensus(),
mp,
evPool,
blockStore,
sm.BlockExecutorWithMetrics(nodeMetrics.state),
)
csReactor, csState, err := createConsensusReactor(ctx,
cfg, state, blockExec, blockStore, mp, evPool,
privValidator, nodeMetrics.consensus, stateSync || blockSync, eventBus,
peerManager, router, logger,
)
if err != nil {
return nil, combineCloseError(err, makeCloser(closers))
}
// Create the blockchain reactor. Note, we do not start block sync if we're
// doing a state sync first.
bcReactor, err := blocksync.NewReactor(ctx,
logger.With("module", "blockchain"),
state.Copy(),
blockExec,
blockStore,
csReactor,
router.OpenChannel,
peerManager.Subscribe(ctx),
blockSync && !stateSync,
nodeMetrics.consensus,
eventBus,
)
if err != nil {
return nil, combineCloseError(
fmt.Errorf("could not create blocksync reactor: %w", err),
makeCloser(closers))
}
// Make ConsensusReactor. Don't enable fully if doing a state sync and/or block sync first.
// FIXME We need to update metrics here, since other reactors don't have access to them.
if stateSync {
nodeMetrics.consensus.StateSyncing.Set(1)
} else if blockSync {
nodeMetrics.consensus.BlockSyncing.Set(1)
}
// Set up state sync reactor, and schedule a sync if requested.
// FIXME The way we do phased startups (e.g. replay -> block sync -> consensus) is very messy,
// we should clean this whole thing up. See:
// https://github.com/tendermint/tendermint/issues/4644
stateSyncReactor, err := statesync.NewReactor(
ctx,
genDoc.ChainID,
genDoc.InitialHeight,
*cfg.StateSync,
logger.With("module", "statesync"),
proxyApp.Snapshot(),
proxyApp.Query(),
router.OpenChannel,
peerManager.Subscribe(ctx),
stateStore,
blockStore,
cfg.StateSync.TempDir,
nodeMetrics.statesync,
eventBus,
)
if err != nil {
return nil, combineCloseError(err, makeCloser(closers))
}
var pexReactor service.Service
if cfg.P2P.PexReactor {
pexReactor, err = pex.NewReactor(ctx, logger, peerManager, router.OpenChannel, peerManager.Subscribe(ctx))
if err != nil {
return nil, combineCloseError(err, makeCloser(closers))
}
}
node := &nodeImpl{
config: cfg,
logger: logger,
genesisDoc: genDoc,
privValidator: privValidator,
peerManager: peerManager,
router: router,
nodeInfo: nodeInfo,
nodeKey: nodeKey,
eventSinks: eventSinks,
services: []service.Service{
eventBus,
indexerService,
evReactor,
mpReactor,
csReactor,
bcReactor,
pexReactor,
},
stateStore: stateStore,
blockStore: blockStore,
stateSyncReactor: stateSyncReactor,
stateSync: stateSync,
shutdownOps: makeCloser(closers),
rpcEnv: &rpccore.Environment{
ProxyAppQuery: proxyApp.Query(),
ProxyAppMempool: proxyApp.Mempool(),
StateStore: stateStore,
BlockStore: blockStore,
EvidencePool: evPool,
ConsensusState: csState,
ConsensusReactor: csReactor,
BlockSyncReactor: bcReactor,
PeerManager: peerManager,
GenDoc: genDoc,
EventSinks: eventSinks,
EventBus: eventBus,
Mempool: mp,
Logger: logger.With("module", "rpc"),
Config: *cfg.RPC,
},
}
if cfg.Mode == config.ModeValidator {
node.rpcEnv.PubKey = pubKey
}
node.rpcEnv.P2PTransport = node
node.BaseService = *service.NewBaseService(logger, "Node", node)
return node, nil
}
// OnStart starts the Node. It implements service.Service.
func (n *nodeImpl) OnStart(ctx context.Context) error {
if n.config.RPC.PprofListenAddress != "" {
rpcCtx, rpcCancel := context.WithCancel(ctx)
srv := &http.Server{Addr: n.config.RPC.PprofListenAddress, Handler: nil}
go func() {
select {
case <-ctx.Done():
sctx, scancel := context.WithTimeout(context.Background(), time.Second)
defer scancel()
_ = srv.Shutdown(sctx)
case <-rpcCtx.Done():
}
}()
go func() {
n.logger.Info("Starting pprof server", "laddr", n.config.RPC.PprofListenAddress)
if err := srv.ListenAndServe(); err != nil {
n.logger.Error("pprof server error", "err", err)
rpcCancel()
}
}()
}
now := tmtime.Now()
genTime := n.genesisDoc.GenesisTime
if genTime.After(now) {
n.logger.Info("Genesis time is in the future. Sleeping until then...", "genTime", genTime)
time.Sleep(genTime.Sub(now))
}
// Start the RPC server before the P2P server
// so we can eg. receive txs for the first block
if n.config.RPC.ListenAddress != "" {
var err error
n.rpcListeners, err = n.rpcEnv.StartService(ctx, n.config)
if err != nil {
return err
}
}
if n.config.Instrumentation.Prometheus && n.config.Instrumentation.PrometheusListenAddr != "" {
n.prometheusSrv = n.startPrometheusServer(ctx, n.config.Instrumentation.PrometheusListenAddr)
}
// Start the transport.
if err := n.router.Start(ctx); err != nil {
return err
}
n.isListening = true
for _, reactor := range n.services {
if err := reactor.Start(ctx); err != nil {
if errors.Is(err, service.ErrAlreadyStarted) {
continue
}
return fmt.Errorf("problem starting service '%T': %w ", reactor, err)
}
}
if err := n.stateSyncReactor.Start(ctx); err != nil {
return err
}
// Run state sync
// TODO: We shouldn't run state sync if we already have state that has a
// LastBlockHeight that is not InitialHeight
if n.stateSync {
bcR := n.rpcEnv.BlockSyncReactor
// we need to get the genesis state to get parameters such as
state, err := sm.MakeGenesisState(n.genesisDoc)
if err != nil {
return fmt.Errorf("unable to derive state: %w", err)
}
// TODO: we may want to move these events within the respective
// reactors.
// At the beginning of the statesync start, we use the initialHeight as the event height
// because of the statesync doesn't have the concreate state height before fetched the snapshot.
d := types.EventDataStateSyncStatus{Complete: false, Height: state.InitialHeight}
if err := n.stateSyncReactor.PublishStatus(ctx, d); err != nil {
n.logger.Error("failed to emit the statesync start event", "err", err)
}
// RUN STATE SYNC NOW:
//
// TODO: Eventually this should run as part of some
// separate orchestrator
n.logger.Info("starting state sync")
ssState, err := n.stateSyncReactor.Sync(ctx)
if err != nil {
n.logger.Error("state sync failed; shutting down this node", "err", err)
// stop the node
if err := n.Stop(); err != nil {
n.logger.Error("failed to shut down node", "err", err)
}
return err
}
n.rpcEnv.ConsensusReactor.SetStateSyncingMetrics(0)
if err := n.stateSyncReactor.PublishStatus(ctx,
types.EventDataStateSyncStatus{
Complete: true,
Height: ssState.LastBlockHeight,
}); err != nil {
n.logger.Error("failed to emit the statesync start event", "err", err)
return err
}
// TODO: Some form of orchestrator is needed here between the state
// advancing reactors to be able to control which one of the three
// is running
// FIXME Very ugly to have these metrics bleed through here.
n.rpcEnv.ConsensusReactor.SetBlockSyncingMetrics(1)
if err := bcR.SwitchToBlockSync(ctx, ssState); err != nil {
n.logger.Error("failed to switch to block sync", "err", err)
return err
}
if err := bcR.PublishStatus(ctx,
types.EventDataBlockSyncStatus{
Complete: false,
Height: ssState.LastBlockHeight,
}); err != nil {
n.logger.Error("failed to emit the block sync starting event", "err", err)
return err
}
}
return nil
}
// OnStop stops the Node. It implements service.Service.
func (n *nodeImpl) OnStop() {
n.logger.Info("Stopping Node")
for _, es := range n.eventSinks {
if err := es.Stop(); err != nil {
n.logger.Error("failed to stop event sink", "err", err)
}
}
for _, reactor := range n.services {
reactor.Wait()
}
n.stateSyncReactor.Wait()
n.router.Wait()
n.isListening = false
// finally stop the listeners / external services
for _, l := range n.rpcListeners {
n.logger.Info("Closing rpc listener", "listener", l)
if err := l.Close(); err != nil {
n.logger.Error("error closing listener", "listener", l, "err", err)
}
}
if pvsc, ok := n.privValidator.(service.Service); ok {
pvsc.Wait()
}
if n.prometheusSrv != nil {
if err := n.prometheusSrv.Shutdown(context.Background()); err != nil {
// Error from closing listeners, or context timeout:
n.logger.Error("Prometheus HTTP server Shutdown", "err", err)
}
}
if err := n.shutdownOps(); err != nil {
if strings.TrimSpace(err.Error()) != "" {
n.logger.Error("problem shutting down additional services", "err", err)
}
}
if n.blockStore != nil {
if err := n.blockStore.Close(); err != nil {
n.logger.Error("problem closing blockstore", "err", err)
}
}
if n.stateStore != nil {
if err := n.stateStore.Close(); err != nil {
n.logger.Error("problem closing statestore", "err", err)
}
}
}
// startPrometheusServer starts a Prometheus HTTP server, listening for metrics
// collectors on addr.
func (n *nodeImpl) startPrometheusServer(ctx context.Context, addr string) *http.Server {
srv := &http.Server{
Addr: addr,
Handler: promhttp.InstrumentMetricHandler(
prometheus.DefaultRegisterer, promhttp.HandlerFor(
prometheus.DefaultGatherer,
promhttp.HandlerOpts{MaxRequestsInFlight: n.config.Instrumentation.MaxOpenConnections},
),
),
}
promCtx, promCancel := context.WithCancel(ctx)
go func() {
select {
case <-ctx.Done():
sctx, scancel := context.WithTimeout(context.Background(), time.Second)
defer scancel()
_ = srv.Shutdown(sctx)
case <-promCtx.Done():
}
}()
go func() {
if err := srv.ListenAndServe(); err != nil {
n.logger.Error("Prometheus HTTP server ListenAndServe", "err", err)
promCancel()
}
}()
return srv
}
// EventBus returns the Node's EventBus.
func (n *nodeImpl) EventBus() *eventbus.EventBus {
return n.rpcEnv.EventBus
}
// GenesisDoc returns the Node's GenesisDoc.
func (n *nodeImpl) GenesisDoc() *types.GenesisDoc {
return n.genesisDoc
}
// RPCEnvironment makes sure RPC has all the objects it needs to operate.
func (n *nodeImpl) RPCEnvironment() *rpccore.Environment {
return n.rpcEnv
}
//------------------------------------------------------------------------------
func (n *nodeImpl) Listeners() []string {
return []string{
fmt.Sprintf("Listener(@%v)", n.config.P2P.ExternalAddress),
}
}
func (n *nodeImpl) IsListening() bool {
return n.isListening
}
// NodeInfo returns the Node's Info from the Switch.
func (n *nodeImpl) NodeInfo() types.NodeInfo {
return n.nodeInfo
}
// genesisDocProvider returns a GenesisDoc.
// It allows the GenesisDoc to be pulled from sources other than the
// filesystem, for instance from a distributed key-value store cluster.
type genesisDocProvider func() (*types.GenesisDoc, error)
// defaultGenesisDocProviderFunc returns a GenesisDocProvider that loads
// the GenesisDoc from the config.GenesisFile() on the filesystem.
func defaultGenesisDocProviderFunc(cfg *config.Config) genesisDocProvider {
return func() (*types.GenesisDoc, error) {
return types.GenesisDocFromFile(cfg.GenesisFile())
}
}
type nodeMetrics struct {
consensus *consensus.Metrics
indexer *indexer.Metrics
mempool *mempool.Metrics
p2p *p2p.Metrics
proxy *proxy.Metrics
state *sm.Metrics
statesync *statesync.Metrics
}
// metricsProvider returns consensus, p2p, mempool, state, statesync Metrics.
type metricsProvider func(chainID string) *nodeMetrics
// defaultMetricsProvider returns Metrics build using Prometheus client library
// if Prometheus is enabled. Otherwise, it returns no-op Metrics.
func defaultMetricsProvider(cfg *config.InstrumentationConfig) metricsProvider {
return func(chainID string) *nodeMetrics {
if cfg.Prometheus {
return &nodeMetrics{
consensus: consensus.PrometheusMetrics(cfg.Namespace, "chain_id", chainID),
indexer: indexer.PrometheusMetrics(cfg.Namespace, "chain_id", chainID),
mempool: mempool.PrometheusMetrics(cfg.Namespace, "chain_id", chainID),
p2p: p2p.PrometheusMetrics(cfg.Namespace, "chain_id", chainID),
proxy: proxy.PrometheusMetrics(cfg.Namespace, "chain_id", chainID),
state: sm.PrometheusMetrics(cfg.Namespace, "chain_id", chainID),
statesync: statesync.PrometheusMetrics(cfg.Namespace, "chain_id", chainID),
}
}
return &nodeMetrics{
consensus: consensus.NopMetrics(),
indexer: indexer.NopMetrics(),
mempool: mempool.NopMetrics(),
p2p: p2p.NopMetrics(),
proxy: proxy.NopMetrics(),
state: sm.NopMetrics(),
statesync: statesync.NopMetrics(),
}
}
}
//------------------------------------------------------------------------------
// loadStateFromDBOrGenesisDocProvider attempts to load the state from the
// database, or creates one using the given genesisDocProvider. On success this also
// returns the genesis doc loaded through the given provider.
func loadStateFromDBOrGenesisDocProvider(
stateStore sm.Store,
genDoc *types.GenesisDoc,
) (sm.State, error) {
// 1. Attempt to load state form the database
state, err := stateStore.Load()
if err != nil {
return sm.State{}, err
}
if state.IsEmpty() {
// 2. If it's not there, derive it from the genesis doc
state, err = sm.MakeGenesisState(genDoc)
if err != nil {
return sm.State{}, err
}
}
return state, nil
}
func getRouterConfig(conf *config.Config, proxyApp proxy.AppConns) p2p.RouterOptions {
opts := p2p.RouterOptions{
QueueType: conf.P2P.QueueType,
}
if conf.FilterPeers && proxyApp != nil {
opts.FilterPeerByID = func(ctx context.Context, id types.NodeID) error {
res, err := proxyApp.Query().Query(ctx, abci.RequestQuery{
Path: fmt.Sprintf("/p2p/filter/id/%s", id),
})
if err != nil {
return err
}
if res.IsErr() {
return fmt.Errorf("error querying abci app: %v", res)
}
return nil
}
opts.FilterPeerByIP = func(ctx context.Context, ip net.IP, port uint16) error {
res, err := proxyApp.Query().Query(ctx, abci.RequestQuery{
Path: fmt.Sprintf("/p2p/filter/addr/%s", net.JoinHostPort(ip.String(), strconv.Itoa(int(port)))),
})
if err != nil {
return err
}
if res.IsErr() {
return fmt.Errorf("error querying abci app: %v", res)
}
return nil
}
}
return opts
}