diff --git a/internal/statesync/reactor.go b/internal/statesync/reactor.go
index 0836a3e34..12d42baa0 100644
--- a/internal/statesync/reactor.go
+++ b/internal/statesync/reactor.go
@@ -307,7 +307,7 @@ func (r *Reactor) Sync(ctx context.Context) (sm.State, error) {
 
 	err = r.Backfill(ctx, state)
 	if err != nil {
-		return sm.State{}, err
+		r.Logger.Error("backfill failed. Proceeding optimistically...", "err", err)
 	}
 
 	return state, nil
diff --git a/internal/statesync/syncer.go b/internal/statesync/syncer.go
index b74349395..ef5c72209 100644
--- a/internal/statesync/syncer.go
+++ b/internal/statesync/syncer.go
@@ -45,9 +45,6 @@ var (
 	errTimeout = errors.New("timed out waiting for chunk")
 	// errNoSnapshots is returned by SyncAny() if no snapshots are found and discovery is disabled.
 	errNoSnapshots = errors.New("no suitable snapshots found")
-	// errStateProviderTimeout is returned by Sync() when the timeout for retrieving
-	// tendermint state or the commit is exceeded
-	errStateProviderTimeout = errors.New("state provider timed out")
 )
 
 // syncer runs a state sync against an ABCI app. Use either SyncAny() to automatically attempt to
@@ -153,8 +150,6 @@ func (s *syncer) SyncAny(
 	discoveryTime time.Duration,
 	requestSnapshots func(),
 ) (sm.State, *types.Commit, error) {
-	s.logger.Info("in sync any")
-
 	if discoveryTime != 0 && discoveryTime < minimumDiscoveryTime {
 		discoveryTime = minimumDiscoveryTime
 	}
@@ -182,7 +177,6 @@ func (s *syncer) SyncAny(
 			if discoveryTime == 0 {
 				return sm.State{}, nil, errNoSnapshots
 			}
-			// requestSnapshots()
 			s.logger.Info(fmt.Sprintf("Discovering snapshots for %v", discoveryTime))
 			time.Sleep(discoveryTime)
 			continue
@@ -195,9 +189,7 @@ func (s *syncer) SyncAny(
 			defer chunks.Close() // in case we forget to close it elsewhere
 		}
 
-		s.logger.Info("starting sync")
 		newState, commit, err := s.Sync(ctx, snapshot, chunks)
-		s.logger.Info("after sync")
 		switch {
 		case err == nil:
 			return newState, commit, nil
@@ -233,10 +225,6 @@ func (s *syncer) SyncAny(
 				s.logger.Info("Snapshot sender rejected", "peer", peer)
 			}
 
-		case errors.Is(err, errStateProviderTimeout):
-			s.logger.Info("Timed out retrieving state and commit, rejecting and retrying...", "height", snapshot.Height)
-			s.snapshots.Reject(snapshot)
-
 		default:
 			return sm.State{}, nil, fmt.Errorf("snapshot restoration failed: %w", err)
 		}
@@ -273,10 +261,13 @@ func (s *syncer) Sync(ctx context.Context, snapshot *snapshot, chunks *chunkQueu
 	// Fetch the app hash corresponding to the snapshot
 	appHash, err := s.stateProvider.AppHash(hctx, snapshot.Height)
 	if err != nil {
-		if err == context.DeadlineExceeded && ctx.Err() == nil {
-			return sm.State{}, nil, errStateProviderTimeout
+		// check if the main context was triggered
+		if ctx.Err() != nil {
+			return sm.State{}, nil, ctx.Err()
 		}
-		return sm.State{}, nil, fmt.Errorf("failed to get app hash: %w", err)
+		s.logger.Info("failed to get and verify tendermint state. Dropping snapshot and trying again",
+			"err", err, "height", snapshot.Height)
+		return sm.State{}, nil, errRejectSnapshot
 	}
 	snapshot.trustedAppHash = appHash
 
@@ -299,21 +290,23 @@ func (s *syncer) Sync(ctx context.Context, snapshot *snapshot, chunks *chunkQueu
 	// Optimistically build new state, so we don't discover any light client failures at the end.
 	state, err := s.stateProvider.State(pctx, snapshot.Height)
 	if err != nil {
-		// check if the provider context exceeded the 10 second deadline
-		if err == context.DeadlineExceeded && ctx.Err() == nil {
-			return sm.State{}, nil, errStateProviderTimeout
+		// check if the main context was triggered
+		if ctx.Err() != nil {
+			return sm.State{}, nil, ctx.Err()
 		}
-
-		return sm.State{}, nil, fmt.Errorf("failed to build new state: %w", err)
+		s.logger.Info("failed to get and verify tendermint state. Dropping snapshot and trying again",
+			"err", err, "height", snapshot.Height)
+		return sm.State{}, nil, errRejectSnapshot
 	}
 
 	commit, err := s.stateProvider.Commit(pctx, snapshot.Height)
 	if err != nil {
 		// check if the provider context exceeded the 10 second deadline
-		if err == context.DeadlineExceeded && ctx.Err() == nil {
-			return sm.State{}, nil, errStateProviderTimeout
+		if ctx.Err() != nil {
+			return sm.State{}, nil, ctx.Err()
 		}
-
-		return sm.State{}, nil, fmt.Errorf("failed to fetch commit: %w", err)
+		s.logger.Info("failed to get and verify commit. Dropping snapshot and trying again",
+			"err", err, "height", snapshot.Height)
+		return sm.State{}, nil, errRejectSnapshot
 	}
 	// Restore snapshot
diff --git a/test/e2e/networks/ci.toml b/test/e2e/networks/ci.toml
index e9bb13515..d25701859 100644
--- a/test/e2e/networks/ci.toml
+++ b/test/e2e/networks/ci.toml
@@ -75,7 +75,7 @@ mode = "full"
 start_at = 1010
 # FIXME: should be v2, disabled due to flake
 block_sync = "v0"
-persistent_peers = ["validator01", "validator02", "validator03", "validator04", "validator05"]
+persistent_peers = ["validator01", "validator02", "validator03", "validator04"]
 perturb = ["restart"]
 retain_blocks = 7
 state_sync = "rpc"
diff --git a/test/e2e/runner/start.go b/test/e2e/runner/start.go
index c3152cb68..b85d6c569 100644
--- a/test/e2e/runner/start.go
+++ b/test/e2e/runner/start.go
@@ -85,17 +85,11 @@ func Start(testnet *e2e.Testnet) error {
 			networkHeight = node.StartAt
 
-			logger.Info("Waiting for network to advance before starting catch up node",
-				"node", node.Name,
-				"height", networkHeight)
-
 			if _, _, err := waitForHeight(testnet, networkHeight); err != nil {
 				return err
 			}
 		}
 
-		logger.Info("Starting catch up node", "node", node.Name, "height", node.StartAt)
-
 		if err := execCompose(testnet.Dir, "up", "-d", node.Name); err != nil {
 			return err
 		}
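
Note on the error-handling pattern the syncer.go hunks introduce, as a minimal self-contained Go sketch rather than code from this PR: each state-provider call runs under a short-lived child context, and on failure the caller first checks whether the outer context was cancelled. Only a parent cancellation aborts the sync; any other failure, including the child deadline expiring, is mapped to a sentinel error so the caller can reject the snapshot and retry with a different one. The names fetchAppHash and trustedAppHash and the 10-second deadline below are illustrative stand-ins, not the actual stateProvider API.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// errRejectSnapshot mirrors the sentinel used in the diff: it tells the
// caller to drop the current snapshot and try another one.
var errRejectSnapshot = errors.New("snapshot was rejected")

// fetchAppHash is a hypothetical stand-in for s.stateProvider.AppHash:
// it blocks until its context is done or the (simulated) peer responds.
func fetchAppHash(ctx context.Context, height uint64) ([]byte, error) {
	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	case <-time.After(50 * time.Millisecond): // simulated provider latency
		return []byte("apphash"), nil
	}
}

func trustedAppHash(ctx context.Context, height uint64) ([]byte, error) {
	// Give the provider call its own deadline so one slow peer cannot
	// stall the whole sync.
	pctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	hash, err := fetchAppHash(pctx, height)
	if err != nil {
		// If the main context was cancelled, abort the sync entirely.
		if ctx.Err() != nil {
			return nil, ctx.Err()
		}
		// Otherwise (e.g. the child deadline expired), reject this
		// snapshot so the caller can retry with another one.
		return nil, errRejectSnapshot
	}
	return hash, nil
}

func main() {
	hash, err := trustedAppHash(context.Background(), 1010)
	fmt.Println(string(hash), err)
}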