diff --git a/weed/shell/command_volume_list.go b/weed/shell/command_volume_list.go index 590ca399e..a8ddb4c54 100644 --- a/weed/shell/command_volume_list.go +++ b/weed/shell/command_volume_list.go @@ -132,9 +132,92 @@ func (c *commandVolumeList) writeTopologyInfo(writer io.Writer, t *master_pb.Top s.add(c.writeDataCenterInfo(writer, dc, verbosityLevel)) } output(verbosityLevel >= 0, writer, "%+v \n", s) + // Scan the full topology (ignoring any -collection/-volumeId/-dataCenter + // filters) so this cluster-health warning always fires. + writeDuplicateVolumeIdWarning(writer, findDuplicateVolumeIds(t)) return s } +// findDuplicateVolumeIds returns volume ids that appear under more than one +// collection, mapped to the sorted list of collections they live in. Volume +// ids are meant to be globally unique; a duplicate means the master handed out +// an id already in use by another collection (historically after losing its +// max-volume-id counter on restart, see the master resumeState flag). Such ids +// are dangerous: collection.delete on one collection destroys that collection's +// copy, and any operation keyed on the bare id (lookup, move, vacuum) is +// ambiguous. Both normal volumes and EC shards are scanned. Replicas of the +// same volume share a collection, so they collapse into a single entry and do +// not register as duplicates. +func findDuplicateVolumeIds(t *master_pb.TopologyInfo) map[uint32][]string { + // Duplicates are rare, so track only the first collection seen per id and + // allocate a set lazily on the first clash. Keeps allocations O(duplicates) + // rather than O(volumes), which matters on clusters with millions of ids. + firstCollectionByVid := make(map[uint32]string) + collectionsByVid := make(map[uint32]map[string]struct{}) + note := func(vid uint32, collection string) { + if collections := collectionsByVid[vid]; collections != nil { + collections[collection] = struct{}{} + return + } + first, seen := firstCollectionByVid[vid] + if !seen { + firstCollectionByVid[vid] = collection + return + } + if first == collection { + return + } + collectionsByVid[vid] = map[string]struct{}{first: {}, collection: {}} + } + for _, dc := range t.DataCenterInfos { + for _, rack := range dc.RackInfos { + for _, dn := range rack.DataNodeInfos { + for _, disk := range dn.DiskInfos { + for _, vi := range disk.VolumeInfos { + note(vi.Id, vi.Collection) + } + for _, ec := range disk.EcShardInfos { + note(ec.Id, ec.Collection) + } + } + } + } + } + duplicates := make(map[uint32][]string) + for vid, collections := range collectionsByVid { + names := make([]string, 0, len(collections)) + for name := range collections { + names = append(names, name) + } + sort.Strings(names) + duplicates[vid] = names + } + return duplicates +} + +func writeDuplicateVolumeIdWarning(writer io.Writer, duplicates map[uint32][]string) { + if len(duplicates) == 0 { + return + } + vids := make([]uint32, 0, len(duplicates)) + for vid := range duplicates { + vids = append(vids, vid) + } + slices.Sort(vids) + fmt.Fprintf(writer, "\nWARNING: %d volume id(s) exist in more than one collection. "+ + "Deleting one collection (e.g. collection.delete) will destroy that collection's data on these shared ids, "+ + "and lookups/moves on the bare id are ambiguous. Verify before any destructive operation:\n", len(vids)) + for _, vid := range vids { + collections := duplicates[vid] + for i, name := range collections { + if name == "" { + collections[i] = `"" (default)` + } + } + fmt.Fprintf(writer, " volume %d in collections: %s\n", vid, strings.Join(collections, ", ")) + } +} + func (c *commandVolumeList) writeDataCenterInfo(writer io.Writer, t *master_pb.DataCenterInfo, verbosityLevel int) statistics { var s statistics slices.SortFunc(t.RackInfos, func(a, b *master_pb.RackInfo) int { diff --git a/weed/shell/command_volume_list_test.go b/weed/shell/command_volume_list_test.go index 6b1bc4583..757de239e 100644 --- a/weed/shell/command_volume_list_test.go +++ b/weed/shell/command_volume_list_test.go @@ -193,3 +193,89 @@ func TestWriteDataNodeInfo_SplitsCollapsedDisksByPhysicalDiskId(t *testing.T) { t.Errorf("DataNode header callback ran %d times; want 1 (regression: header printed once per split disk)", rackInvocations) } } + +// volNode builds a single-node topology from (volumeId, collection) pairs so the +// duplicate-detection tests can describe a cluster compactly. +func volNode(nodeId string, volumes ...*master_pb.VolumeInformationMessage) *master_pb.DataNodeInfo { + return &master_pb.DataNodeInfo{ + Id: nodeId, + DiskInfos: map[string]*master_pb.DiskInfo{ + "hdd": {Type: "hdd", VolumeInfos: volumes}, + }, + } +} + +func topoFromNodes(nodes ...*master_pb.DataNodeInfo) *master_pb.TopologyInfo { + return &master_pb.TopologyInfo{ + DataCenterInfos: []*master_pb.DataCenterInfo{{ + Id: "dc1", + RackInfos: []*master_pb.RackInfo{{Id: "rack1", DataNodeInfos: nodes}}, + }}, + } +} + +// TestFindDuplicateVolumeIds covers the dangerous condition where the master +// reused a volume id across two collections (e.g. id 59788 in both infra-backup +// and ai-news-data-infrastructure) — the state that lets collection.delete +// destroy the wrong collection's data. The detector must flag exactly the shared +// ids, while same-id replicas of one volume must NOT be flagged. +func TestFindDuplicateVolumeIds(t *testing.T) { + t.Run("flags id shared across collections", func(t *testing.T) { + topo := topoFromNodes( + volNode("srv0487:8082", &master_pb.VolumeInformationMessage{Id: 59788, Collection: "infra-backup"}), + volNode("srv0502:8083", &master_pb.VolumeInformationMessage{Id: 59788, Collection: "ai-news-data-infrastructure"}), + // A clean, single-collection volume that must not be flagged. + volNode("srv0487:8085", &master_pb.VolumeInformationMessage{Id: 7069, Collection: "ai-news-data-infrastructure"}), + ) + dup := findDuplicateVolumeIds(topo) + assert.Equal(t, map[uint32][]string{ + 59788: {"ai-news-data-infrastructure", "infra-backup"}, + }, dup) + }) + + t.Run("replicas of one volume are not duplicates", func(t *testing.T) { + // Same id on three nodes, all the same collection: a normal replicated + // volume, not a cross-collection clash. + topo := topoFromNodes( + volNode("n1", &master_pb.VolumeInformationMessage{Id: 42, Collection: "c"}), + volNode("n2", &master_pb.VolumeInformationMessage{Id: 42, Collection: "c"}), + volNode("n3", &master_pb.VolumeInformationMessage{Id: 42, Collection: "c"}), + ) + assert.Empty(t, findDuplicateVolumeIds(topo)) + }) + + t.Run("default empty collection counts as its own collection", func(t *testing.T) { + topo := topoFromNodes( + volNode("n1", &master_pb.VolumeInformationMessage{Id: 1, Collection: ""}), + volNode("n2", &master_pb.VolumeInformationMessage{Id: 1, Collection: "x"}), + ) + assert.Equal(t, map[uint32][]string{1: {"", "x"}}, findDuplicateVolumeIds(topo)) + }) + + t.Run("normal volume and ec shard sharing an id across collections", func(t *testing.T) { + topo := topoFromNodes(volNode("n1", &master_pb.VolumeInformationMessage{Id: 5, Collection: "a"})) + topo.DataCenterInfos[0].RackInfos[0].DataNodeInfos[0].DiskInfos["hdd"].EcShardInfos = + []*master_pb.VolumeEcShardInformationMessage{{Id: 5, Collection: "b", EcIndexBits: 1}} + assert.Equal(t, map[uint32][]string{5: {"a", "b"}}, findDuplicateVolumeIds(topo)) + }) +} + +func TestWriteDuplicateVolumeIdWarning(t *testing.T) { + var buf bytes.Buffer + writeDuplicateVolumeIdWarning(&buf, map[uint32][]string{ + 59788: {"ai-news-data-infrastructure", "infra-backup"}, + 1: {"", "x"}, + }) + out := buf.String() + assert.Contains(t, out, "WARNING: 2 volume id(s) exist in more than one collection") + assert.Contains(t, out, "volume 59788 in collections: ai-news-data-infrastructure, infra-backup") + // Empty collection name is rendered readably rather than as a blank. + assert.Contains(t, out, `volume 1 in collections: "" (default), x`) + // Lowest id first. + assert.Less(t, strings.Index(out, "volume 1 "), strings.Index(out, "volume 59788 ")) + + // Clean topology prints nothing. + var empty bytes.Buffer + writeDuplicateVolumeIdWarning(&empty, nil) + assert.Empty(t, empty.String()) +}