mirror of
https://github.com/tendermint/tendermint.git
synced 2026-01-08 14:21:14 +00:00
consensus: attempt to repair the WAL file on data corruption (#4682)
Closes: #4578 Co-authored-by: Anton Kaliaev <anton.kalyaev@gmail.com>
This commit is contained in:
@@ -4,6 +4,7 @@ import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"reflect"
|
||||
"runtime/debug"
|
||||
"sync"
|
||||
@@ -276,23 +277,63 @@ func (cs *State) LoadCommit(height int64) *types.Commit {
|
||||
return cs.blockStore.LoadBlockCommit(height)
|
||||
}
|
||||
|
||||
// OnStart implements service.Service.
|
||||
// It loads the latest state via the WAL, and starts the timeout and receive routines.
|
||||
// OnStart loads the latest state via the WAL, and starts the timeout and
|
||||
// receive routines.
|
||||
func (cs *State) OnStart() error {
|
||||
if err := cs.evsw.Start(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// we may set the WAL in testing before calling Start,
|
||||
// so only OpenWAL if its still the nilWAL
|
||||
// We may set the WAL in testing before calling Start, so only OpenWAL if its
|
||||
// still the nilWAL.
|
||||
if _, ok := cs.wal.(nilWAL); ok {
|
||||
walFile := cs.config.WalFile()
|
||||
wal, err := cs.OpenWAL(walFile)
|
||||
if err != nil {
|
||||
cs.Logger.Error("Error loading State wal", "err", err.Error())
|
||||
if err := cs.loadWalFile(); err != nil {
|
||||
return err
|
||||
}
|
||||
cs.wal = wal
|
||||
}
|
||||
|
||||
// We may have lost some votes if the process crashed reload from consensus
|
||||
// log to catchup.
|
||||
if cs.doWALCatchup {
|
||||
repairAttempted := false
|
||||
LOOP:
|
||||
for {
|
||||
err := cs.catchupReplay(cs.Height)
|
||||
switch {
|
||||
case err == nil:
|
||||
break LOOP
|
||||
case !IsDataCorruptionError(err):
|
||||
cs.Logger.Error("Error on catchup replay. Proceeding to start State anyway", "err", err)
|
||||
break LOOP
|
||||
case repairAttempted:
|
||||
return err
|
||||
}
|
||||
|
||||
cs.Logger.Info("WAL file is corrupted. Attempting repair", "err", err)
|
||||
|
||||
// 1) prep work
|
||||
cs.wal.Stop()
|
||||
repairAttempted = true
|
||||
|
||||
// 2) backup original WAL file
|
||||
corruptedFile := fmt.Sprintf("%s.CORRUPTED", cs.config.WalFile())
|
||||
if err := tmos.CopyFile(cs.config.WalFile(), corruptedFile); err != nil {
|
||||
return err
|
||||
}
|
||||
cs.Logger.Info("Backed up WAL file", "src", cs.config.WalFile(), "dst", corruptedFile)
|
||||
|
||||
// 3) try to repair (WAL file will be overwritten!)
|
||||
if err := repairWalFile(corruptedFile, cs.config.WalFile()); err != nil {
|
||||
cs.Logger.Error("Repair failed", "err", err)
|
||||
return err
|
||||
}
|
||||
cs.Logger.Info("Successful repair")
|
||||
|
||||
// reload WAL file
|
||||
if err := cs.loadWalFile(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := cs.evsw.Start(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// we need the timeoutRoutine for replay so
|
||||
@@ -304,33 +345,6 @@ func (cs *State) OnStart() error {
|
||||
return err
|
||||
}
|
||||
|
||||
// we may have lost some votes if the process crashed
|
||||
// reload from consensus log to catchup
|
||||
if cs.doWALCatchup {
|
||||
if err := cs.catchupReplay(cs.Height); err != nil {
|
||||
// don't try to recover from data corruption error
|
||||
if IsDataCorruptionError(err) {
|
||||
cs.Logger.Error("Encountered corrupt WAL file", "err", err.Error())
|
||||
cs.Logger.Error("Please repair the WAL file before restarting")
|
||||
fmt.Println(`You can attempt to repair the WAL as follows:
|
||||
|
||||
----
|
||||
WALFILE=~/.tendermint/data/cs.wal/wal
|
||||
cp $WALFILE ${WALFILE}.bak # backup the file
|
||||
go run scripts/wal2json/main.go $WALFILE > wal.json # this will panic, but can be ignored
|
||||
rm $WALFILE # remove the corrupt file
|
||||
go run scripts/json2wal/main.go wal.json $WALFILE # rebuild the file without corruption
|
||||
----`)
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
cs.Logger.Error("Error on catchup replay. Proceeding to start State anyway", "err", err.Error())
|
||||
// NOTE: if we ever do return an error here,
|
||||
// make sure to stop the timeoutTicker
|
||||
}
|
||||
}
|
||||
|
||||
// now start the receiveRoutine
|
||||
go cs.receiveRoutine(0)
|
||||
|
||||
@@ -352,6 +366,17 @@ func (cs *State) startRoutines(maxSteps int) {
|
||||
go cs.receiveRoutine(maxSteps)
|
||||
}
|
||||
|
||||
// loadWalFile loads WAL data from file. It overwrites cs.wal.
|
||||
func (cs *State) loadWalFile() error {
|
||||
wal, err := cs.OpenWAL(cs.config.WalFile())
|
||||
if err != nil {
|
||||
cs.Logger.Error("Error loading State wal", "err", err)
|
||||
return err
|
||||
}
|
||||
cs.wal = wal
|
||||
return nil
|
||||
}
|
||||
|
||||
// OnStop implements service.Service.
|
||||
func (cs *State) OnStop() {
|
||||
cs.evsw.Stop()
|
||||
@@ -366,15 +391,17 @@ func (cs *State) Wait() {
|
||||
<-cs.done
|
||||
}
|
||||
|
||||
// OpenWAL opens a file to log all consensus messages and timeouts for deterministic accountability
|
||||
// OpenWAL opens a file to log all consensus messages and timeouts for
|
||||
// deterministic accountability.
|
||||
func (cs *State) OpenWAL(walFile string) (WAL, error) {
|
||||
wal, err := NewWAL(walFile)
|
||||
if err != nil {
|
||||
cs.Logger.Error("Failed to open WAL for consensus state", "wal", walFile, "err", err)
|
||||
cs.Logger.Error("Failed to open WAL", "file", walFile, "err", err)
|
||||
return nil, err
|
||||
}
|
||||
wal.SetLogger(cs.Logger.With("wal", walFile))
|
||||
if err := wal.Start(); err != nil {
|
||||
cs.Logger.Error("Failed to start WAL", "err", err)
|
||||
return nil, err
|
||||
}
|
||||
return wal, nil
|
||||
@@ -2034,3 +2061,39 @@ func CompareHRS(h1 int64, r1 int, s1 cstypes.RoundStepType, h2 int64, r2 int, s2
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// repairWalFile decodes messages from src (until the decoder errors) and
|
||||
// writes them to dst.
|
||||
func repairWalFile(src, dst string) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
|
||||
out, err := os.Open(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
|
||||
var (
|
||||
dec = NewWALDecoder(in)
|
||||
enc = NewWALEncoder(out)
|
||||
)
|
||||
|
||||
// best-case repair (until first error is encountered)
|
||||
for {
|
||||
msg, err := dec.Decode()
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
|
||||
err = enc.Encode(msg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to encode msg: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user