make some tweaks to error handling

Callum Waters
2021-08-27 12:23:07 +02:00
parent 0e45a1d0eb
commit 5baf7a7f0c
4 changed files with 19 additions and 32 deletions

View File

@@ -307,7 +307,7 @@ func (r *Reactor) Sync(ctx context.Context) (sm.State, error) {
 	err = r.Backfill(ctx, state)
 	if err != nil {
-		return sm.State{}, err
+		r.Logger.Error("backfill failed. Proceeding optimistically...", "err", err)
 	}
 	return state, nil
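Net effect of this hunk: an error from Backfill is now logged and state sync still completes, instead of aborting. A minimal sketch of the resulting code path (names as in the hunk above):

	err = r.Backfill(ctx, state)
	if err != nil {
		// Backfill is now best-effort: keep the synced state and let the
		// node proceed rather than failing the whole state sync.
		r.Logger.Error("backfill failed. Proceeding optimistically...", "err", err)
	}
	return state, nil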

View File

@@ -45,9 +45,6 @@ var (
 	errTimeout = errors.New("timed out waiting for chunk")
 	// errNoSnapshots is returned by SyncAny() if no snapshots are found and discovery is disabled.
 	errNoSnapshots = errors.New("no suitable snapshots found")
-	// errStateProviderTimeout is returned by Sync() when the timeout for retrieving
-	// tendermint state or the commit is exceeded
-	errStateProviderTimeout = errors.New("state provider timed out")
 )

 // syncer runs a state sync against an ABCI app. Use either SyncAny() to automatically attempt to
@@ -153,8 +150,6 @@ func (s *syncer) SyncAny(
 	discoveryTime time.Duration,
 	requestSnapshots func(),
 ) (sm.State, *types.Commit, error) {
-	s.logger.Info("in sync any")
 	if discoveryTime != 0 && discoveryTime < minimumDiscoveryTime {
 		discoveryTime = minimumDiscoveryTime
 	}
@@ -182,7 +177,6 @@ func (s *syncer) SyncAny(
 		if discoveryTime == 0 {
 			return sm.State{}, nil, errNoSnapshots
 		}
-		// requestSnapshots()
 		s.logger.Info(fmt.Sprintf("Discovering snapshots for %v", discoveryTime))
 		time.Sleep(discoveryTime)
 		continue
@@ -195,9 +189,7 @@ func (s *syncer) SyncAny(
 			defer chunks.Close() // in case we forget to close it elsewhere
 		}
-		s.logger.Info("starting sync")
 		newState, commit, err := s.Sync(ctx, snapshot, chunks)
-		s.logger.Info("after sync")
 		switch {
 		case err == nil:
 			return newState, commit, nil
@@ -233,10 +225,6 @@ func (s *syncer) SyncAny(
 				s.logger.Info("Snapshot sender rejected", "peer", peer)
 			}
-		case errors.Is(err, errStateProviderTimeout):
-			s.logger.Info("Timed out retrieving state and commit, rejecting and retrying...", "height", snapshot.Height)
-			s.snapshots.Reject(snapshot)
 		default:
 			return sm.State{}, nil, fmt.Errorf("snapshot restoration failed: %w", err)
 		}
@@ -273,10 +261,13 @@ func (s *syncer) Sync(ctx context.Context, snapshot *snapshot, chunks *chunkQueu
 	// Fetch the app hash corresponding to the snapshot
 	appHash, err := s.stateProvider.AppHash(hctx, snapshot.Height)
 	if err != nil {
-		if err == context.DeadlineExceeded && ctx.Err() == nil {
-			return sm.State{}, nil, errStateProviderTimeout
+		// check if the main context was triggered
+		if ctx.Err() != nil {
+			return sm.State{}, nil, ctx.Err()
 		}
-		return sm.State{}, nil, fmt.Errorf("failed to get app hash: %w", err)
+		s.logger.Info("failed to get and verify tendermint state. Dropping snapshot and trying again",
+			"err", err, "height", snapshot.Height)
+		return sm.State{}, nil, errRejectSnapshot
 	}
 	snapshot.trustedAppHash = appHash
@@ -299,21 +290,23 @@ func (s *syncer) Sync(ctx context.Context, snapshot *snapshot, chunks *chunkQueu
 	// Optimistically build new state, so we don't discover any light client failures at the end.
 	state, err := s.stateProvider.State(pctx, snapshot.Height)
 	if err != nil {
-		// check if the provider context exceeded the 10 second deadline
-		if err == context.DeadlineExceeded && ctx.Err() == nil {
-			return sm.State{}, nil, errStateProviderTimeout
+		// check if the main context was triggered
+		if ctx.Err() != nil {
+			return sm.State{}, nil, ctx.Err()
 		}
-		return sm.State{}, nil, fmt.Errorf("failed to build new state: %w", err)
+		s.logger.Info("failed to get and verify tendermint state. Dropping snapshot and trying again",
+			"err", err, "height", snapshot.Height)
+		return sm.State{}, nil, errRejectSnapshot
 	}
 	commit, err := s.stateProvider.Commit(pctx, snapshot.Height)
 	if err != nil {
-		// check if the provider context exceeded the 10 second deadline
-		if err == context.DeadlineExceeded && ctx.Err() == nil {
-			return sm.State{}, nil, errStateProviderTimeout
+		if ctx.Err() != nil {
+			return sm.State{}, nil, ctx.Err()
 		}
-		return sm.State{}, nil, fmt.Errorf("failed to fetch commit: %w", err)
+		s.logger.Info("failed to get and verify commit. Dropping snapshot and trying again",
+			"err", err, "height", snapshot.Height)
+		return sm.State{}, nil, errRejectSnapshot
 	}
 	// Restore snapshot
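For context, the errRejectSnapshot values returned above are handled by the switch in SyncAny shown earlier: the snapshot is dropped and the loop tries the next candidate. A minimal sketch of that flow, assuming an existing errRejectSnapshot case (not shown in these hunks) and the snapshots.Reject method seen in the deleted errStateProviderTimeout case; the log message is illustrative:

		newState, commit, err := s.Sync(ctx, snapshot, chunks)
		switch {
		case err == nil:
			return newState, commit, nil
		case errors.Is(err, errRejectSnapshot):
			// State provider failures now surface as errRejectSnapshot, so
			// this single case replaces the deleted errStateProviderTimeout
			// handling: reject the snapshot and retry with another.
			s.snapshots.Reject(snapshot)
			s.logger.Info("Snapshot rejected", "height", snapshot.Height)
		default:
			return sm.State{}, nil, fmt.Errorf("snapshot restoration failed: %w", err)
		}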

View File

@@ -75,7 +75,7 @@ mode = "full"
 start_at = 1010
 # FIXME: should be v2, disabled due to flake
 block_sync = "v0"
-persistent_peers = ["validator01", "validator02", "validator03", "validator04", "validator05"]
+persistent_peers = ["validator01", "validator02", "validator03", "validator04"]
 perturb = ["restart"]
 retain_blocks = 7
 state_sync = "rpc"

View File

@@ -85,17 +85,11 @@ func Start(testnet *e2e.Testnet) error {
			networkHeight = node.StartAt

			logger.Info("Waiting for network to advance before starting catch up node",
				"node", node.Name,
				"height", networkHeight)

			if _, _, err := waitForHeight(testnet, networkHeight); err != nil {
				return err
			}
		}

		logger.Info("Starting catch up node", "node", node.Name, "height", node.StartAt)

		if err := execCompose(testnet.Dir, "up", "-d", node.Name); err != nil {
			return err
		}