From 39eba4e1543960e7783581ec75d2e531e8d44afa Mon Sep 17 00:00:00 2001 From: Ethan Buchman Date: Mon, 4 Feb 2019 13:00:06 -0500 Subject: [PATCH] WAL: better errors and new fail point (#3246) * privval: more info in errors * wal: change Debug logs to Info * wal: log and return error on corrupted wal instead of panicing * fail: Exit right away instead of sending interupt * consensus: FAIL before handling our own vote allows to replicate #3089: - run using `FAIL_TEST_INDEX=0` - delete some bytes from the end of the WAL - start normally Results in logs like: ``` I[2019-02-03|18:12:58.225] Searching for height module=consensus wal=/Users/ethanbuchman/.tendermint/data/cs.wal/wal height=1 min=0 max=0 E[2019-02-03|18:12:58.225] Error on catchup replay. Proceeding to start ConsensusState anyway module=consensus err="failed to read data: EOF" I[2019-02-03|18:12:58.225] Started node module=main nodeInfo="{ProtocolVersion:{P2P:6 Block:9 App:1} ID_:35e87e93f2e31f305b65a5517fd2102331b56002 ListenAddr:tcp://0.0.0.0:26656 Network:test-chain-J8JvJH Version:0.29.1 Channels:4020212223303800 Moniker:Ethans-MacBook-Pro.local Other:{TxIndex:on RPCAddress:tcp://0.0.0.0:26657}}" E[2019-02-03|18:12:58.226] Couldn't connect to any seeds module=p2p I[2019-02-03|18:12:59.229] Timed out module=consensus dur=998.568ms height=1 round=0 step=RoundStepNewHeight I[2019-02-03|18:12:59.230] enterNewRound(1/0). Current: 1/0/RoundStepNewHeight module=consensus height=1 round=0 I[2019-02-03|18:12:59.230] enterPropose(1/0). Current: 1/0/RoundStepNewRound module=consensus height=1 round=0 I[2019-02-03|18:12:59.230] enterPropose: Our turn to propose module=consensus height=1 round=0 proposer=AD278B7767B05D7FBEB76207024C650988FA77D5 privValidator="PrivValidator{AD278B7767B05D7FBEB76207024C650988FA77D5 LH:1, LR:0, LS:2}" E[2019-02-03|18:12:59.230] enterPropose: Error signing proposal module=consensus height=1 round=0 err="Error signing proposal: Step regression at height 1 round 0. Got 1, last step 2" I[2019-02-03|18:13:02.233] Timed out module=consensus dur=3s height=1 round=0 step=RoundStepPropose I[2019-02-03|18:13:02.233] enterPrevote(1/0). Current: 1/0/RoundStepPropose module=consensus I[2019-02-03|18:13:02.233] enterPrevote: ProposalBlock is nil module=consensus height=1 round=0 E[2019-02-03|18:13:02.234] Error signing vote module=consensus height=1 round=0 vote="Vote{0:AD278B7767B0 1/00/1(Prevote) 000000000000 000000000000 @ 2019-02-04T02:13:02.233897Z}" err="Error signing vote: Conflicting data" ``` Notice the EOF, the step regression, and the conflicting data. * wal: change errors to be DataCorruptionError * exit on corrupt WAL * fix log * fix new line --- consensus/replay.go | 4 ++-- consensus/state.go | 26 ++++++++++++++++++++++++++ consensus/wal.go | 14 +++++++------- libs/fail/fail.go | 5 +++-- privval/file.go | 6 +++--- 5 files changed, 41 insertions(+), 14 deletions(-) diff --git a/consensus/replay.go b/consensus/replay.go index 3ac636577..21fef6b29 100644 --- a/consensus/replay.go +++ b/consensus/replay.go @@ -144,8 +144,8 @@ func (cs *ConsensusState) catchupReplay(csHeight int64) error { if err == io.EOF { break } else if IsDataCorruptionError(err) { - cs.Logger.Debug("data has been corrupted in last height of consensus WAL", "err", err, "height", csHeight) - panic(fmt.Sprintf("data has been corrupted (%v) in last height %d of consensus WAL", err, csHeight)) + cs.Logger.Error("data has been corrupted in last height of consensus WAL", "err", err, "height", csHeight) + return err } else if err != nil { return err } diff --git a/consensus/state.go b/consensus/state.go index 158e1605e..cec7e5f5e 100644 --- a/consensus/state.go +++ b/consensus/state.go @@ -307,6 +307,23 @@ func (cs *ConsensusState) OnStart() error { // reload from consensus log to catchup if cs.doWALCatchup { if err := cs.catchupReplay(cs.Height); err != nil { + // don't try to recover from data corruption error + if IsDataCorruptionError(err) { + cs.Logger.Error("Encountered corrupt WAL file", "err", err.Error()) + cs.Logger.Error("Please repair the WAL file before restarting") + fmt.Println(`You can attempt to repair the WAL as follows: + +---- +WALFILE=~/.tendermint/data/cs.wal/wal +cp $WALFILE ${WALFILE}.bak # backup the file +go run scripts/wal2json/main.go $WALFILE > wal.json # this will panic, but can be ignored +rm $WALFILE # remove the corrupt file +go run scripts/json2wal/main.go wal.json $WALFILE # rebuild the file without corruption +----`) + + return err + } + cs.Logger.Error("Error on catchup replay. Proceeding to start ConsensusState anyway", "err", err.Error()) // NOTE: if we ever do return an error here, // make sure to stop the timeoutTicker @@ -624,6 +641,15 @@ func (cs *ConsensusState) receiveRoutine(maxSteps int) { cs.handleMsg(mi) case mi = <-cs.internalMsgQueue: cs.wal.WriteSync(mi) // NOTE: fsync + + if _, ok := mi.Msg.(*VoteMessage); ok { + // we actually want to simulate failing during + // the previous WriteSync, but this isn't easy to do. + // Equivalent would be to fail here and manually remove + // some bytes from the end of the wal. + fail.Fail() // XXX + } + // handles proposals, block parts, votes cs.handleMsg(mi) case ti := <-cs.timeoutTicker.Chan(): // tockChan: diff --git a/consensus/wal.go b/consensus/wal.go index bbc9908fb..ba89cd1aa 100644 --- a/consensus/wal.go +++ b/consensus/wal.go @@ -163,7 +163,7 @@ func (wal *baseWAL) SearchForEndHeight(height int64, options *WALSearchOptions) // NOTE: starting from the last file in the group because we're usually // searching for the last height. See replay.go min, max := wal.group.MinIndex(), wal.group.MaxIndex() - wal.Logger.Debug("Searching for height", "height", height, "min", min, "max", max) + wal.Logger.Info("Searching for height", "height", height, "min", min, "max", max) for index := max; index >= min; index-- { gr, err = wal.group.NewReader(index) if err != nil { @@ -183,7 +183,7 @@ func (wal *baseWAL) SearchForEndHeight(height int64, options *WALSearchOptions) break } if options.IgnoreDataCorruptionErrors && IsDataCorruptionError(err) { - wal.Logger.Debug("Corrupted entry. Skipping...", "err", err) + wal.Logger.Error("Corrupted entry. Skipping...", "err", err) // do nothing continue } else if err != nil { @@ -194,7 +194,7 @@ func (wal *baseWAL) SearchForEndHeight(height int64, options *WALSearchOptions) if m, ok := msg.Msg.(EndHeightMessage); ok { lastHeightFound = m.Height if m.Height == height { // found - wal.Logger.Debug("Found", "height", height, "index", index) + wal.Logger.Info("Found", "height", height, "index", index) return gr, true, nil } } @@ -281,25 +281,25 @@ func (dec *WALDecoder) Decode() (*TimedWALMessage, error) { return nil, err } if err != nil { - return nil, fmt.Errorf("failed to read checksum: %v", err) + return nil, DataCorruptionError{fmt.Errorf("failed to read checksum: %v", err)} } crc := binary.BigEndian.Uint32(b) b = make([]byte, 4) _, err = dec.rd.Read(b) if err != nil { - return nil, fmt.Errorf("failed to read length: %v", err) + return nil, DataCorruptionError{fmt.Errorf("failed to read length: %v", err)} } length := binary.BigEndian.Uint32(b) if length > maxMsgSizeBytes { - return nil, fmt.Errorf("length %d exceeded maximum possible value of %d bytes", length, maxMsgSizeBytes) + return nil, DataCorruptionError{fmt.Errorf("length %d exceeded maximum possible value of %d bytes", length, maxMsgSizeBytes)} } data := make([]byte, length) _, err = dec.rd.Read(data) if err != nil { - return nil, fmt.Errorf("failed to read data: %v", err) + return nil, DataCorruptionError{fmt.Errorf("failed to read data: %v", err)} } // check checksum before decoding data diff --git a/libs/fail/fail.go b/libs/fail/fail.go index edfca13e3..d7912af5c 100644 --- a/libs/fail/fail.go +++ b/libs/fail/fail.go @@ -72,7 +72,8 @@ func FailRand(n int) { func Exit() { fmt.Printf("*** fail-test %d ***\n", callIndex) - proc, _ := os.FindProcess(os.Getpid()) - proc.Signal(os.Interrupt) + os.Exit(1) + // proc, _ := os.FindProcess(os.Getpid()) + // proc.Signal(os.Interrupt) // panic(fmt.Sprintf("*** fail-test %d ***", callIndex)) } diff --git a/privval/file.go b/privval/file.go index 8072cfa4a..d27d7a788 100644 --- a/privval/file.go +++ b/privval/file.go @@ -87,17 +87,17 @@ type FilePVLastSignState struct { func (lss *FilePVLastSignState) CheckHRS(height int64, round int, step int8) (bool, error) { if lss.Height > height { - return false, errors.New("Height regression") + return false, fmt.Errorf("Height regression. Got %v, last height %v", height, lss.Height) } if lss.Height == height { if lss.Round > round { - return false, errors.New("Round regression") + return false, fmt.Errorf("Round regression at height %v. Got %v, last round %v", height, round, lss.Round) } if lss.Round == round { if lss.Step > step { - return false, errors.New("Step regression") + return false, fmt.Errorf("Step regression at height %v round %v. Got %v, last step %v", height, round, step, lss.Step) } else if lss.Step == step { if lss.SignBytes != nil { if lss.Signature == nil {