Files
seaweedfs/weed/command/fix.go
Chris Lu 303c2be38d feat(fix): rebuild lost EC index (.ecx) and .vif from local shards (#9596)
weed fix -ecx reconstructs the .dat from the local data shards, scans the
needles, and writes a fresh ascending-sorted .ecx containing only live
entries — the same on-disk index WriteSortedFileFromIdx emits at encode
time. When the .vif is also missing it is regenerated from the inferred
EC ratio (flags > .vif > shard-count inference / 10+4) and the .dat size
recovered from the scan.

When some data shards are missing but at least dataShards shards survive,
the missing shards are first reconstructed from the survivors via
Reed-Solomon, so a partial shard set is repaired too.

Also makes erasure_coding.WriteDatFile de-stripe using len(shardFileNames)
instead of the DataShardsCount constant, so the caller's actual data-shard
count is honored (behavior-preserving for the default 10, and fixing the
existing caller that already passes ECContext.DataShards).

This recovers an EC volume whose sealed index was lost from every node
while enough shards survive, a state neither ec.rebuild nor ec.decode can
repair because both require an existing .ecx.

Flags: -ecx, -ecDataShards, -ecParityShards. Run with the volume server
stopped.
2026-05-21 00:41:27 -07:00

501 lines
17 KiB
Go

package command
import (
"fmt"
"io"
"io/fs"
"os"
"path"
"strconv"
"strings"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
"github.com/seaweedfs/seaweedfs/weed/storage"
"github.com/seaweedfs/seaweedfs/weed/storage/backend"
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
"github.com/seaweedfs/seaweedfs/weed/storage/needle_map"
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
"github.com/seaweedfs/seaweedfs/weed/storage/types"
"github.com/seaweedfs/seaweedfs/weed/storage/volume_info"
"github.com/seaweedfs/seaweedfs/weed/util"
)
// init attaches runFix to cmdFix after package-level variables have been
// initialized; per the inline note, doing it here instead of inside the cmdFix
// literal breaks an initialization cycle.
func init() {
cmdFix.Run = runFix // break init cycle
}
// cmdFix describes the "weed fix" sub-command: its usage line plus the short
// and long help texts. Its Run handler is assigned in init.
var cmdFix = &Command{
UsageLine: "fix [-remoteFile=false] [-volumeId=234] [-collection=bigData] /tmp",
Short: "run weed tool fix on files or whole folders to recreate index file(s) if corrupted",
Long: `Fix runs the SeaweedFS fix command on local dat files ( or remote files) or whole folders to re-create the index .idx file. If fixing remote files, you need to synchronize master.toml to the same directory on the current node as on the master node.
You Need to stop the volume server when running this command.
Use -ecx to rebuild a lost EC index (.ecx) — and the .vif when missing — from the local .ec## shards.
`,
}
// Command-line flags registered on cmdFix. The first five steer the .idx
// rebuild; the last three (-ecx, -ecDataShards, -ecParityShards) steer the
// EC index (.ecx) rebuild.
var (
fixVolumeCollection = cmdFix.Flag.String("collection", "", "an optional volume collection name, if specified only it will be processed")
fixVolumeId = cmdFix.Flag.Int64("volumeId", 0, "an optional volume id, if not 0 (default) only it will be processed")
fixIncludeDeleted = cmdFix.Flag.Bool("includeDeleted", true, "include deleted entries in the index file")
fixIgnoreError = cmdFix.Flag.Bool("ignoreError", false, "an optional, if true will be processed despite errors")
fixRemoteFile = cmdFix.Flag.Bool("remoteFile", false, "an optional, if true will not try to load the local .dat file, but only the remote file")
fixGenerateEcx = cmdFix.Flag.Bool("ecx", false, "regenerate a lost EC index (.ecx) — and the .vif when missing — from the local .ec## shards (missing shards are reconstructed from parity when enough survive). Run with the volume server stopped.")
fixEcDataShards = cmdFix.Flag.Int("ecDataShards", 0, "EC data shard count for -ecx (0 = read from .vif, otherwise default 10)")
fixEcParityShards = cmdFix.Flag.Int("ecParityShards", 0, "EC parity shard count for -ecx (0 = read from .vif, infer from shard count, otherwise default 4)")
)
// VolumeFileScanner4Fix accumulates the needle entries seen while a volume
// file is scanned, so that an index file can be written from them afterwards
// (see SaveToIdx).
type VolumeFileScanner4Fix struct {
// version is the volume version copied from the superblock by VisitSuperBlock.
version needle.Version
// nm holds the entries of needles whose size is valid (live needles).
nm *needle_map.MemDb
// nmDeleted holds tombstone entries; it is only filled when includeDeleted is true.
nmDeleted *needle_map.MemDb
// includeDeleted selects whether deleted needles are recorded as tombstones
// (true) or removed from nm (false).
includeDeleted bool
}
// VisitSuperBlock remembers the volume version from the superblock; VisitNeedle
// uses it to compute the on-disk needle size for logging. It never fails.
func (scanner *VolumeFileScanner4Fix) VisitSuperBlock(superBlock super_block.SuperBlock) error {
scanner.version = superBlock.Version
return nil
}
// ReadNeedleBody always returns false. VisitNeedle in this file never touches
// its needleBody argument, so presumably this lets the scan driver skip reading
// needle bodies — confirm against the scanner contract used by
// storage.ScanVolumeFile.
func (scanner *VolumeFileScanner4Fix) ReadNeedleBody() bool {
return false
}
// VisitNeedle records a single scanned needle.
//
//   - A needle with a valid size is stored in nm at its offset.
//   - A needle without a valid size (deleted) is stored in nmDeleted as a
//     tombstone when includeDeleted is set; otherwise its key is removed from nm.
//
// needleHeader and needleBody are not used. The returned error is the wrapped
// MemDb error of the failing Set, or the raw result of Delete.
func (scanner *VolumeFileScanner4Fix) VisitNeedle(n *needle.Needle, offset int64, needleHeader, needleBody []byte) error {
	glog.V(2).Infof("key %v offset %d size %d disk_size %d compressed %v", n.Id, offset, n.Size, n.DiskSize(scanner.version), n.IsCompressed())

	switch {
	case n.Size.IsValid():
		// Live needle: remember where it sits in the volume file.
		if setErr := scanner.nm.Set(n.Id, types.ToOffset(offset), n.Size); setErr != nil {
			return fmt.Errorf("saved %d with error %v", n.Size, setErr)
		}
	case scanner.includeDeleted:
		// Deleted needle, kept as a tombstone in the separate map.
		if setErr := scanner.nmDeleted.Set(n.Id, types.ToOffset(offset), types.TombstoneFileSize); setErr != nil {
			return fmt.Errorf("saved deleted %d with error %v", n.Size, setErr)
		}
	default:
		// Deleted needle and tombstones are not wanted: forget the key.
		glog.V(2).Infof("skipping deleted file ...")
		return scanner.nm.Delete(n.Id)
	}
	return nil
}
// runFix is the entry point of "weed fix". Every argument is either a folder
// (all matching files in it are processed) or a single file.
//
// For each argument it rebuilds the .idx of every selected volume file
// (".dat", or ".idx" when -remoteFile is set) via doFixOneVolume, and, when
// -ecx is set, additionally rebuilds lost EC indexes via fixEcxFromShardsInDir.
// The -collection and -volumeId flags narrow the selection.
//
// It returns false as soon as a folder cannot be listed, a file cannot be
// stat'ed, or a volume id cannot be parsed from a file name; otherwise true.
func runFix(cmd *Command, args []string) bool {
	for _, arg := range args {
		// Resolve the argument once and use the resolved form for every
		// filesystem access. Previously the resolved path was only used for
		// path.Split while the raw argument was used for the folder check,
		// as base path and for os.Stat, so whatever util.ResolvePath rewrites
		// was silently discarded.
		resolved := util.ResolvePath(arg)
		basePath, f := path.Split(resolved)
		if util.FolderExists(resolved) {
			basePath = resolved
			f = ""
		}

		var files []fs.DirEntry
		if f == "" {
			entries, err := os.ReadDir(basePath)
			if err != nil {
				fmt.Println(err)
				return false
			}
			files = entries
		} else {
			fileInfo, err := os.Stat(resolved)
			if err != nil {
				fmt.Println(err)
				return false
			}
			files = []fs.DirEntry{fs.FileInfoToDirEntry(fileInfo)}
		}

		// With -remoteFile the local .dat is absent, so volumes are found
		// through their .idx and the remote backend configuration is loaded.
		ext := ".dat"
		if *fixRemoteFile {
			ext = ".idx"
			util.LoadConfiguration("master", false)
			backend.LoadConfiguration(util.GetViper())
		}

		for _, file := range files {
			if !strings.HasSuffix(file.Name(), ext) {
				continue
			}
			if *fixVolumeCollection != "" && !strings.HasPrefix(file.Name(), *fixVolumeCollection+"_") {
				continue
			}

			// File names look like "<collection>_<volumeId><ext>" or
			// "<volumeId><ext>"; the id is whatever follows the last "_".
			baseFileName := strings.TrimSuffix(file.Name(), ext)
			collection, volumeIdStr := "", baseFileName
			if sepIndex := strings.LastIndex(baseFileName, "_"); sepIndex > 0 {
				collection = baseFileName[:sepIndex]
				volumeIdStr = baseFileName[sepIndex+1:]
			}
			volumeId, parseErr := strconv.ParseInt(volumeIdStr, 10, 64)
			if parseErr != nil {
				fmt.Printf("Failed to parse volume id from %s: %v\n", baseFileName, parseErr)
				return false
			}
			if *fixVolumeId != 0 && *fixVolumeId != volumeId {
				continue
			}
			doFixOneVolume(basePath, baseFileName, collection, volumeId, *fixIncludeDeleted)
		}

		if *fixGenerateEcx && !fixEcxFromShardsInDir(basePath, files) {
			return false
		}
	}
	return true
}
// fixEcxFromShardsInDir finds EC volumes in files and regenerates the .ecx
// (and .vif when missing) for each, honoring the -collection and -volumeId
// filters.
//
// A volume is recognized by ANY of its shard files (.ec00, .ec01, ...), not
// only by .ec00: doFixEcxFromShards can reconstruct missing data shards, so a
// volume whose shard 0 is the lost one must still be discovered. Each volume
// is processed once, at the first of its shard files encountered in files.
//
// It returns false when a volume id cannot be parsed from a shard file name
// and true otherwise; failures inside doFixEcxFromShards are reported by that
// function itself.
func fixEcxFromShardsInDir(basePath string, files []fs.DirEntry) bool {
	seen := make(map[string]struct{})
	for _, file := range files {
		baseFileName, isShard := fixEcShardBaseName(file.Name())
		if !isShard {
			continue
		}
		if *fixVolumeCollection != "" && !strings.HasPrefix(file.Name(), *fixVolumeCollection+"_") {
			continue
		}
		if _, done := seen[baseFileName]; done {
			continue
		}
		seen[baseFileName] = struct{}{}

		// Base names look like "<collection>_<volumeId>" or "<volumeId>".
		collection, volumeIdStr := "", baseFileName
		if sepIndex := strings.LastIndex(baseFileName, "_"); sepIndex > 0 {
			collection = baseFileName[:sepIndex]
			volumeIdStr = baseFileName[sepIndex+1:]
		}
		volumeId, parseErr := strconv.ParseInt(volumeIdStr, 10, 64)
		if parseErr != nil {
			fmt.Printf("Failed to parse volume id from %s: %v\n", baseFileName, parseErr)
			return false
		}
		if *fixVolumeId != 0 && *fixVolumeId != volumeId {
			continue
		}
		doFixEcxFromShards(basePath, baseFileName, collection, volumeId)
	}
	return true
}

// fixEcShardBaseName reports whether name is an EC shard file, i.e. ends in
// ".ec" followed by a two-digit shard index below erasure_coding.MaxShardCount,
// and if so returns name without that extension. Sibling files such as .ecx
// are rejected because their suffix is not two decimal digits.
func fixEcShardBaseName(name string) (string, bool) {
	ext := path.Ext(name)
	if len(ext) != len(".ec00") || !strings.HasPrefix(ext, ".ec") {
		return "", false
	}
	tens, ones := ext[3], ext[4]
	if tens < '0' || tens > '9' || ones < '0' || ones > '9' {
		return "", false
	}
	shardId := int(tens-'0')*10 + int(ones-'0')
	// Cross-check against the canonical extension so this helper cannot drift
	// from the naming the rest of the EC code uses.
	if shardId >= erasure_coding.MaxShardCount || erasure_coding.ToExt(shardId) != ext {
		return "", false
	}
	return strings.TrimSuffix(name, ext), true
}
// SaveToIdx writes the entries collected by the scanner to idxName, creating
// or truncating the file. Live entries are written in ascending key order;
// when the scanner includes deleted entries, the tombstone recorded for a key
// is written directly after that key's live entry.
//
// It returns the error from opening the file, from the first failing write,
// or — if everything else succeeded — from closing the file.
func SaveToIdx(scaner *VolumeFileScanner4Fix, idxName string) (ret error) {
	idxFile, err := os.OpenFile(idxName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
	if err != nil {
		// Return the error explicitly: a naked return here would hand back the
		// still-nil named result and report success without any file written.
		return err
	}
	defer func() {
		// A failing Close on a written file can mean lost data, so surface it
		// unless an earlier write error is already being returned.
		if closeErr := idxFile.Close(); closeErr != nil && ret == nil {
			ret = closeErr
		}
	}()

	return scaner.nm.AscendingVisit(func(value needle_map.NeedleValue) error {
		if _, writeErr := idxFile.Write(value.ToBytes()); writeErr != nil {
			return writeErr
		}
		if !scaner.includeDeleted {
			return nil
		}
		if deleted, ok := scaner.nmDeleted.Get(value.Key); ok {
			if _, writeErr := idxFile.Write(deleted.ToBytes()); writeErr != nil {
				return writeErr
			}
		}
		return nil
	})
}
// doFixOneVolume rebuilds <basepath>/<baseFileName>.idx for one volume by
// scanning the volume file and writing the collected entries with SaveToIdx.
//
// Errors are logged and tolerated when -ignoreError is set; otherwise they
// abort via glog.Fatal. A volume id outside the uint32 range stops processing
// of this volume even with -ignoreError. When saving fails without
// -ignoreError, the index file is removed before aborting.
func doFixOneVolume(basepath string, baseFileName string, collection string, volumeId int64, fixIncludeDeleted bool) {
	indexFileName := path.Join(basepath, baseFileName+".idx")

	nm := needle_map.NewMemDb()
	nmDeleted := needle_map.NewMemDb()
	defer nm.Close()
	defer nmDeleted.Close()

	// logOrDie logs err and carries on under -ignoreError, aborts otherwise.
	logOrDie := func(err error) {
		if *fixIgnoreError {
			glog.Error(err)
			return
		}
		glog.Fatal(err)
	}

	// Validate volumeId range before converting to uint32
	if volumeId < 0 || volumeId > 0xFFFFFFFF {
		logOrDie(fmt.Errorf("volume ID out of range: %d", volumeId))
		if *fixIgnoreError {
			return
		}
	}

	// lgtm[go/incorrect-integer-conversion]
	// Safe conversion: volumeId has been validated to be in range [0, 0xFFFFFFFF] above
	vid := needle.VolumeId(volumeId)

	scanner := &VolumeFileScanner4Fix{
		nm:             nm,
		nmDeleted:      nmDeleted,
		includeDeleted: fixIncludeDeleted,
	}

	if scanErr := storage.ScanVolumeFile(basepath, collection, vid, storage.NeedleMapInMemory, scanner); scanErr != nil {
		logOrDie(fmt.Errorf("scan .dat File: %w", scanErr))
	}

	saveErr := SaveToIdx(scanner, indexFileName)
	if saveErr == nil {
		return
	}
	if !*fixIgnoreError {
		// About to abort: do not leave a partially written index behind.
		if rmErr := os.Remove(indexFileName); rmErr != nil {
			glog.Errorf("failed to cleanup file %s:%v", indexFileName, rmErr)
		}
	}
	logOrDie(fmt.Errorf("save to .idx File: %w", saveErr))
}
// doFixEcxFromShards rebuilds the sealed EC index (.ecx) for one EC volume
// directly from its local shards. Steps:
//
//  1. Skip the volume when a non-empty .ecx already exists.
//  2. Discover the shards present; all must have the same size.
//  3. Resolve the EC ratio (flags > existing .vif > defaults, widened to cover
//     the highest shard index present).
//  4. When a data shard is missing but at least dataShards shards survive,
//     reconstruct the missing shards via Reed-Solomon (written back to disk).
//  5. De-stripe the data shards into a temporary .dat, scan it and write a
//     fresh ascending-sorted .ecx holding live entries only.
//  6. When the .vif was missing, regenerate it from the resolved EC ratio and
//     the .dat size discovered during the scan.
//
// Errors go through fail: logged under -ignoreError, glog.Fatal otherwise.
// collection is accepted for symmetry with doFixOneVolume and is not used.
func doFixEcxFromShards(basePath, baseFileName, collection string, volumeId int64) {
	base := path.Join(basePath, baseFileName)
	fail := func(err error) {
		if *fixIgnoreError {
			glog.Error(err)
		} else {
			glog.Fatal(err)
		}
	}

	ecxName := base + ".ecx"
	if info, err := os.Stat(ecxName); err == nil && info.Size() > 0 {
		glog.Infof("volume %d: %s already exists (%d bytes), skipping; remove it first to force regeneration", volumeId, ecxName, info.Size())
		return
	}

	// Discover which shards are present and their common size. Reed-Solomon
	// requires every shard to be the same size. Empty shard files count as
	// absent.
	present := make([]bool, erasure_coding.MaxShardCount)
	presentCount := 0
	maxPresentIdx := -1
	var shardSize int64
	for i := 0; i < erasure_coding.MaxShardCount; i++ {
		shardPath := base + erasure_coding.ToExt(i)
		info, statErr := os.Stat(shardPath)
		if statErr != nil || info.Size() == 0 {
			continue
		}
		if shardSize == 0 {
			shardSize = info.Size()
		} else if info.Size() != shardSize {
			fail(fmt.Errorf("volume %d: shard %s size %d does not match %d", volumeId, shardPath, info.Size(), shardSize))
			return
		}
		present[i] = true
		presentCount++
		maxPresentIdx = i
	}
	if presentCount == 0 {
		fail(fmt.Errorf("volume %d: no EC shards found under %s", volumeId, base))
		return
	}

	// Resolve the EC ratio and the original .dat size.
	// Priority: explicit flags > existing .vif > defaults (10+4).
	vifName := base + ".vif"
	vifExists := util.FileExists(vifName)
	dataShards := erasure_coding.DataShardsCount
	parityShards := erasure_coding.ParityShardsCount
	var datFileSize int64
	if vifExists {
		// MaybeLoadVolumeInfo returns a non-nil error when the .vif exists but
		// cannot be read or unmarshalled; fail loudly rather than silently
		// falling back to defaults (which would be wrong for a custom ratio).
		vi, _, found, loadErr := volume_info.MaybeLoadVolumeInfo(vifName)
		if loadErr != nil {
			fail(fmt.Errorf("volume %d: read %s: %w", volumeId, vifName, loadErr))
			return
		}
		if found && vi != nil {
			if cfg := vi.GetEcShardConfig(); cfg != nil && cfg.GetDataShards() > 0 {
				dataShards = int(cfg.GetDataShards())
				parityShards = int(cfg.GetParityShards())
			}
			datFileSize = vi.GetDatFileSize()
		}
	}
	if *fixEcDataShards > 0 {
		dataShards = *fixEcDataShards
	}
	if *fixEcParityShards > 0 {
		parityShards = *fixEcParityShards
	}
	// Ensure the configured total covers every shard index actually present
	// (a custom-ratio volume with more than the default 14 shards and no .vif).
	// This never lowers parity below the default, so the common 10+4 case stays
	// correct for any subset of missing shards.
	if maxPresentIdx+1 > dataShards+parityShards {
		parityShards = maxPresentIdx + 1 - dataShards
	}
	if dataShards <= 0 || parityShards <= 0 || dataShards+parityShards > erasure_coding.MaxShardCount {
		fail(fmt.Errorf("volume %d: cannot determine EC ratio (data=%d parity=%d); set -ecDataShards/-ecParityShards", volumeId, dataShards, parityShards))
		return
	}

	// Need at least dataShards shards (any data+parity mix) to recover anything.
	if presentCount < dataShards {
		fail(fmt.Errorf("volume %d: only %d shards present, need at least %d (data shards) to recover", volumeId, presentCount, dataShards))
		return
	}

	// If any data shard is missing, reconstruct the missing shards from the
	// survivors via Reed-Solomon before de-striping. This writes the rebuilt
	// shard files back to disk, fully repairing the volume locally.
	dataComplete := true
	for i := 0; i < dataShards; i++ {
		if !present[i] {
			dataComplete = false
			break
		}
	}
	if !dataComplete {
		ctx := &erasure_coding.ECContext{DataShards: dataShards, ParityShards: parityShards}
		glog.Infof("volume %d: %d/%d shards present; reconstructing missing shards (%s) before index rebuild", volumeId, presentCount, dataShards+parityShards, ctx.String())
		if _, err := erasure_coding.RebuildEcFilesWithContext(base, ctx); err != nil {
			fail(fmt.Errorf("volume %d: reconstruct missing shards from %d survivors: %w", volumeId, presentCount, err))
			return
		}
	}

	// Collect the data shards (now all present).
	shardFileNames := make([]string, dataShards)
	for i := 0; i < dataShards; i++ {
		shardPath := base + erasure_coding.ToExt(i)
		if !util.FileExists(shardPath) {
			fail(fmt.Errorf("volume %d: data shard %s still missing after reconstruction", volumeId, shardPath))
			return
		}
		shardFileNames[i] = shardPath
	}

	// Without a recorded original size, reconstruct the fully padded layout; the
	// scan below detects the trailing zero padding and recovers the true size.
	reconstructSize := datFileSize
	if reconstructSize <= 0 {
		reconstructSize = int64(dataShards) * shardSize
		glog.V(0).Infof("volume %d: no .dat size in .vif; reconstructing padded .dat (%d bytes) from %d data shards", volumeId, reconstructSize, dataShards)
	}

	// De-stripe the data shards into a temporary .dat next to the shards.
	tmpBase := base + ".ecxrecover"
	tmpDat := tmpBase + ".dat"
	if err := erasure_coding.WriteDatFile(tmpBase, reconstructSize, shardFileNames); err != nil {
		os.Remove(tmpDat) // best effort: the file may not have been created
		fail(fmt.Errorf("volume %d: reconstruct .dat from data shards: %w", volumeId, err))
		return
	}

	realDatSize, version, err := writeEcxFromDat(tmpDat, ecxName)

	// The temporary .dat has served its purpose whatever the outcome. Remove it
	// here instead of via defer: fail may end in glog.Fatal, which terminates
	// the process without running deferred calls, and a leftover
	// "*.ecxrecover.dat" (as large as the volume) would also be picked up as a
	// volume file by the next "weed fix" run over this folder.
	if rmErr := os.Remove(tmpDat); rmErr != nil {
		glog.Errorf("volume %d: remove temporary %s: %v", volumeId, tmpDat, rmErr)
	}
	if err != nil {
		os.Remove(ecxName) // do not leave a partial index behind
		fail(fmt.Errorf("volume %d: build .ecx from reconstructed .dat: %w", volumeId, err))
		return
	}
	glog.Infof("volume %d: wrote %s from %d data shards", volumeId, ecxName, dataShards)

	// Regenerate the .vif when missing so the volume can mount and future
	// rebuilds know the EC ratio and original .dat size.
	if vifExists {
		return
	}
	size := datFileSize
	if size <= 0 {
		size = realDatSize
	}
	volumeInfo := &volume_server_pb.VolumeInfo{
		Version:     uint32(version),
		DatFileSize: size,
		EcShardConfig: &volume_server_pb.EcShardConfig{
			DataShards:   uint32(dataShards),
			ParityShards: uint32(parityShards),
		},
	}
	if err := volume_info.SaveVolumeInfo(vifName, volumeInfo); err != nil {
		fail(fmt.Errorf("volume %d: write %s: %w", volumeId, vifName, err))
		return
	}
	glog.Infof("volume %d: wrote %s (version %d, datFileSize %d, ec %d+%d)", volumeId, vifName, version, size, dataShards, parityShards)
}
// writeEcxFromDat scans a (reconstructed) .dat and writes an ascending-sorted
// .ecx containing only live needles — the same on-disk shape
// WriteSortedFileFromIdx produces when an EC volume is first encoded.
//
// Only needle headers are read; the scan walks from the end of the superblock
// and stops at end of file, at io.EOF, or at the first all-zero header (the
// start of the EC zero padding). A later entry for a key replaces an earlier
// one, and an entry without a valid size removes the key.
//
// It returns the physical .dat size (the offset where the scan stopped) and
// the volume version read from the superblock. On error the size is 0 and
// ecxPath may hold a partial file, which the caller is expected to remove.
func writeEcxFromDat(datPath, ecxPath string) (datFileSize int64, version needle.Version, err error) {
	f, err := os.OpenFile(datPath, os.O_RDONLY, 0644)
	if err != nil {
		return 0, 0, fmt.Errorf("open %s: %w", datPath, err)
	}
	datBackend := backend.NewDiskFile(f)
	defer datBackend.Close() // read-only handle: a Close error loses nothing

	superBlock, err := super_block.ReadSuperBlock(datBackend)
	if err != nil {
		return 0, 0, fmt.Errorf("read superblock: %w", err)
	}
	version = superBlock.Version

	fileSize, _, err := datBackend.GetStat()
	if err != nil {
		return 0, version, fmt.Errorf("stat %s: %w", datPath, err)
	}

	nm := needle_map.NewMemDb()
	defer nm.Close()

	offset := int64(superBlock.BlockSize())
	for offset < fileSize {
		n, _, rest, readErr := needle.ReadNeedleHeader(datBackend, version, offset)
		if readErr != nil {
			if readErr == io.EOF {
				break
			}
			return 0, version, fmt.Errorf("read needle header at offset %d: %w", offset, readErr)
		}
		// EC encoding zero-pads the tail of the last block row. An all-zero
		// header marks the start of that padding, i.e. the end of real needles.
		if n.Cookie == 0 && n.Id == 0 && n.Size == 0 {
			break
		}
		if n.Size.IsValid() {
			if pe := nm.Set(n.Id, types.ToOffset(offset), n.Size); pe != nil {
				return 0, version, fmt.Errorf("set needle %d: %w", n.Id, pe)
			}
		} else {
			// Deleted/invalid: drop it so the .ecx carries only live entries,
			// matching the encode-time WriteSortedFileFromIdx behavior.
			if pe := nm.Delete(n.Id); pe != nil {
				return 0, version, fmt.Errorf("delete needle %d: %w", n.Id, pe)
			}
		}
		offset += types.NeedleHeaderSize + rest
	}
	datFileSize = offset

	ecxFile, err := os.OpenFile(ecxPath, os.O_TRUNC|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return 0, version, fmt.Errorf("open %s: %w", ecxPath, err)
	}
	visitErr := nm.AscendingVisit(func(value needle_map.NeedleValue) error {
		_, writeErr := ecxFile.Write(value.ToBytes())
		return writeErr
	})
	// Close explicitly and check the result: for a file that was just written,
	// a failing Close can mean the index never reached the disk.
	closeErr := ecxFile.Close()
	if visitErr != nil {
		return 0, version, fmt.Errorf("write %s: %w", ecxPath, visitErr)
	}
	if closeErr != nil {
		return 0, version, fmt.Errorf("close %s: %w", ecxPath, closeErr)
	}
	return datFileSize, version, nil
}