Files
seaweedfs/weed/shell/command_volume_vacuum.go
Chris Lu 08a7502b2c fix(shell): error on missing volume id in fsck, mergeVolumes, vacuum (#9158)
* fix(shell): error on missing volume id in fsck, mergeVolumes, vacuum

Three shell commands silently report success when -volumeId /
-fromVolumeId / -toVolumeId names a volume the master doesn't know
about: typos, already-deleted volumes, and stale scripts all look
identical to a clean no-op, which is what made the confusion in #9116
take as long as it did to diagnose.

- volume.fsck: filter at the per-datanode loop drops unknown ids and
  findExtraChunksInVolumeServers ends with totalOrphanChunkCount==0,
  printing "no orphan data".
- fs.mergeVolumes: createMergePlan iterates only known volumes, so an
  unknown -fromVolumeId produces an empty plan and we print just the
  "max volume size: N MB" header (indistinguishable from "nothing to
  merge").
- volume.vacuum: the master's VacuumVolume RPC silently iterates
  matching volumes; a missing id returns success having done nothing.

Validate the requested ids against the current topology up front and
return an explicit "volume(s) not found on master: [X Y]" error. Also
drop a stale duplicate `if err != nil` in volume.fsck.Do left over from
a prior refactor.

Surfaces #9116 follow-up from madalee-com.

* address review: propagate reloadVolumesInfo error; dedupe vacuum missing ids

- fs.mergeVolumes: c.reloadVolumesInfo's return was ignored. If the
  master is unreachable or VolumeList fails, c.volumes stays empty and
  the new validation block reports "fromVolumeId X not found on master"
  — masking the real connection/RPC failure. Return the wrapped error
  instead.

- volume.vacuum: "volume.vacuum -volumeId 5,5,5" on a missing volume 5
  listed [5 5 5] in the error. Collect missing ids in a set so each
  missing id appears once.

* address review: reject fromVolumeId/toVolumeId values that overflow uint32

flag.Uint produces a uint (64-bit on amd64), and the existing cast to
needle.VolumeId silently truncates to uint32. A typo like
`-fromVolumeId=4294967297` would wrap to volume 1 and slip past every
other validation, so the merge would run against a completely
different volume than the operator intended.

Bail out with an explicit error when the raw flag value exceeds the
uint32 range, before the cast.
2026-04-20 15:32:31 -07:00

117 lines
3.2 KiB
Go

package shell
import (
"context"
"flag"
"fmt"
"io"
"sort"
"strconv"
"strings"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
)
func init() {
Commands = append(Commands, &commandVacuum{})
}
type commandVacuum struct {
}
func (c *commandVacuum) Name() string {
return "volume.vacuum"
}
func (c *commandVacuum) Help() string {
return `compact volumes if deleted entries are more than the limit
volume.vacuum [-garbageThreshold=0.3] [-collection=<collection name>] [-volumeId=<volume id>]
`
}
func (c *commandVacuum) HasTag(CommandTag) bool {
return false
}
func (c *commandVacuum) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
volumeVacuumCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
garbageThreshold := volumeVacuumCommand.Float64("garbageThreshold", 0.3, "vacuum when garbage is more than this limit")
collection := volumeVacuumCommand.String("collection", "", "vacuum this collection")
volumeIds := volumeVacuumCommand.String("volumeId", "", "comma-separated list of volume IDs")
if err = volumeVacuumCommand.Parse(args); err != nil {
return nil
}
if err = commandEnv.confirmIsLocked(args); err != nil {
return
}
var volumeIdInts []uint32
if *volumeIds != "" {
for _, volumeIdStr := range strings.Split(*volumeIds, ",") {
volumeIdStr = strings.TrimSpace(volumeIdStr)
if volumeIdInt, err := strconv.ParseUint(volumeIdStr, 10, 32); err == nil {
volumeIdInts = append(volumeIdInts, uint32(volumeIdInt))
} else {
return fmt.Errorf("parse volumeId string %s to int: %v", volumeIdStr, err)
}
}
} else {
volumeIdInts = append(volumeIdInts, 0)
}
// Reject unknown ids up front. The master's VacuumVolume RPC silently
// iterates matching volumes, so a typo or an already-deleted volume just
// returns success — making this command look like it worked when nothing
// happened.
if *volumeIds != "" {
topo, _, err := collectTopologyInfo(commandEnv, 0)
if err != nil {
return fmt.Errorf("collect topology: %w", err)
}
known := make(map[uint32]bool)
eachDataNode(topo, func(_ DataCenterId, _ RackId, dn *master_pb.DataNodeInfo) {
for _, disk := range dn.DiskInfos {
for _, vi := range disk.VolumeInfos {
known[vi.Id] = true
}
}
})
// Dedupe via a set so "volume.vacuum -volumeId 5,5,5" on a missing
// volume 5 reports [5] once instead of [5 5 5].
missingSet := make(map[uint32]bool)
for _, vid := range volumeIdInts {
if !known[vid] {
missingSet[vid] = true
}
}
if len(missingSet) > 0 {
missing := make([]uint32, 0, len(missingSet))
for vid := range missingSet {
missing = append(missing, vid)
}
sort.Slice(missing, func(i, j int) bool { return missing[i] < missing[j] })
return fmt.Errorf("volume(s) not found on master: %v", missing)
}
}
for _, volumeId := range volumeIdInts {
err = commandEnv.MasterClient.WithClient(false, func(client master_pb.SeaweedClient) error {
_, err = client.VacuumVolume(context.Background(), &master_pb.VacuumVolumeRequest{
GarbageThreshold: float32(*garbageThreshold),
VolumeId: volumeId,
Collection: *collection,
})
return err
})
if err != nil {
return err
}
}
return nil
}