consensus: attempt to repair the WAL file on data corruption (#4682)

Closes: #4578

Co-authored-by: Anton Kaliaev <anton.kalyaev@gmail.com>
This commit is contained in:
Alessio Treglia
2020-06-03 13:14:12 +02:00
committed by GitHub
parent c2578e2262
commit c8483531d8
4 changed files with 169 additions and 42 deletions

View File

@@ -4,6 +4,7 @@ import (
"bytes"
"errors"
"fmt"
"os"
"reflect"
"runtime/debug"
"sync"
@@ -276,23 +277,63 @@ func (cs *State) LoadCommit(height int64) *types.Commit {
return cs.blockStore.LoadBlockCommit(height)
}
// OnStart implements service.Service.
// It loads the latest state via the WAL, and starts the timeout and receive routines.
// OnStart loads the latest state via the WAL, and starts the timeout and
// receive routines.
func (cs *State) OnStart() error {
if err := cs.evsw.Start(); err != nil {
return err
}
// we may set the WAL in testing before calling Start,
// so only OpenWAL if its still the nilWAL
// We may set the WAL in testing before calling Start, so only OpenWAL if its
// still the nilWAL.
if _, ok := cs.wal.(nilWAL); ok {
walFile := cs.config.WalFile()
wal, err := cs.OpenWAL(walFile)
if err != nil {
cs.Logger.Error("Error loading State wal", "err", err.Error())
if err := cs.loadWalFile(); err != nil {
return err
}
cs.wal = wal
}
// We may have lost some votes if the process crashed reload from consensus
// log to catchup.
if cs.doWALCatchup {
repairAttempted := false
LOOP:
for {
err := cs.catchupReplay(cs.Height)
switch {
case err == nil:
break LOOP
case !IsDataCorruptionError(err):
cs.Logger.Error("Error on catchup replay. Proceeding to start State anyway", "err", err)
break LOOP
case repairAttempted:
return err
}
cs.Logger.Info("WAL file is corrupted. Attempting repair", "err", err)
// 1) prep work
cs.wal.Stop()
repairAttempted = true
// 2) backup original WAL file
corruptedFile := fmt.Sprintf("%s.CORRUPTED", cs.config.WalFile())
if err := tmos.CopyFile(cs.config.WalFile(), corruptedFile); err != nil {
return err
}
cs.Logger.Info("Backed up WAL file", "src", cs.config.WalFile(), "dst", corruptedFile)
// 3) try to repair (WAL file will be overwritten!)
if err := repairWalFile(corruptedFile, cs.config.WalFile()); err != nil {
cs.Logger.Error("Repair failed", "err", err)
return err
}
cs.Logger.Info("Successful repair")
// reload WAL file
if err := cs.loadWalFile(); err != nil {
return err
}
}
}
if err := cs.evsw.Start(); err != nil {
return err
}
// we need the timeoutRoutine for replay so
@@ -304,33 +345,6 @@ func (cs *State) OnStart() error {
return err
}
// we may have lost some votes if the process crashed
// reload from consensus log to catchup
if cs.doWALCatchup {
if err := cs.catchupReplay(cs.Height); err != nil {
// don't try to recover from data corruption error
if IsDataCorruptionError(err) {
cs.Logger.Error("Encountered corrupt WAL file", "err", err.Error())
cs.Logger.Error("Please repair the WAL file before restarting")
fmt.Println(`You can attempt to repair the WAL as follows:
----
WALFILE=~/.tendermint/data/cs.wal/wal
cp $WALFILE ${WALFILE}.bak # backup the file
go run scripts/wal2json/main.go $WALFILE > wal.json # this will panic, but can be ignored
rm $WALFILE # remove the corrupt file
go run scripts/json2wal/main.go wal.json $WALFILE # rebuild the file without corruption
----`)
return err
}
cs.Logger.Error("Error on catchup replay. Proceeding to start State anyway", "err", err.Error())
// NOTE: if we ever do return an error here,
// make sure to stop the timeoutTicker
}
}
// now start the receiveRoutine
go cs.receiveRoutine(0)
@@ -352,6 +366,17 @@ func (cs *State) startRoutines(maxSteps int) {
go cs.receiveRoutine(maxSteps)
}
// loadWalFile loads WAL data from file. It overwrites cs.wal.
func (cs *State) loadWalFile() error {
wal, err := cs.OpenWAL(cs.config.WalFile())
if err != nil {
cs.Logger.Error("Error loading State wal", "err", err)
return err
}
cs.wal = wal
return nil
}
// OnStop implements service.Service.
func (cs *State) OnStop() {
cs.evsw.Stop()
@@ -366,15 +391,17 @@ func (cs *State) Wait() {
<-cs.done
}
// OpenWAL opens a file to log all consensus messages and timeouts for deterministic accountability
// OpenWAL opens a file to log all consensus messages and timeouts for
// deterministic accountability.
func (cs *State) OpenWAL(walFile string) (WAL, error) {
wal, err := NewWAL(walFile)
if err != nil {
cs.Logger.Error("Failed to open WAL for consensus state", "wal", walFile, "err", err)
cs.Logger.Error("Failed to open WAL", "file", walFile, "err", err)
return nil, err
}
wal.SetLogger(cs.Logger.With("wal", walFile))
if err := wal.Start(); err != nil {
cs.Logger.Error("Failed to start WAL", "err", err)
return nil, err
}
return wal, nil
@@ -2034,3 +2061,39 @@ func CompareHRS(h1 int64, r1 int, s1 cstypes.RoundStepType, h2 int64, r2 int, s2
}
return 0
}
// repairWalFile decodes messages from src (until the decoder errors) and
// writes them to dst.
func repairWalFile(src, dst string) error {
in, err := os.Open(src)
if err != nil {
return err
}
defer in.Close()
out, err := os.Open(dst)
if err != nil {
return err
}
defer out.Close()
var (
dec = NewWALDecoder(in)
enc = NewWALEncoder(out)
)
// best-case repair (until first error is encountered)
for {
msg, err := dec.Decode()
if err != nil {
break
}
err = enc.Encode(msg)
if err != nil {
return fmt.Errorf("failed to encode msg: %w", err)
}
}
return nil
}