From 0f1e50f9ec08587b9d2fdfb2d98c61ede152f7ba Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Sun, 24 May 2026 15:11:09 -0700 Subject: [PATCH] fix(master): re-register volumes missing from the lookup index A disconnect/reconnect race could drop a volume from vid2location while it stayed in the data node's disk map, so it showed in volume.list and the admin UI but LookupVolume returned "volume id not found" and never self-healed (the full heartbeat only registered volumes new to the disk map). The full heartbeat now re-registers any reported volume missing from the lookup index, reusing the already-resolved VolumeLayout. --- weed/topology/topology.go | 13 +++++++++ weed/topology/topology_test.go | 53 ++++++++++++++++++++++++++++++++++ weed/topology/volume_layout.go | 32 +++++++++++++++++++- 3 files changed, 97 insertions(+), 1 deletion(-) diff --git a/weed/topology/topology.go b/weed/topology/topology.go index b99bbd816..ddba44d2d 100644 --- a/weed/topology/topology.go +++ b/weed/topology/topology.go @@ -586,6 +586,19 @@ func (t *Topology) SyncDataNodeRegistration(volumes []*master_pb.VolumeInformati } diskType := types.ToDiskType(v.DiskType) vl := t.GetVolumeLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType) + // Self-heal: a volume reported by the data node but missing from the + // lookup index is re-registered. This repairs the split left by a + // disconnect/reconnect race, where UnRegisterDataNode dropped the volume + // from vid2location but the reconnecting full heartbeat skipped it + // (still in the disk map, so UpdateVolumes did not report it as new). + // Without this, the volume stays visible in volume.list/admin UI yet + // LookupVolume returns "volume id not found". + if !vl.HasDataNode(v.Id, dn) { + // vl is already resolved above; call it directly instead of + // RegisterVolumeLayout, which would repeat the GetVolumeLayout lookup. + vl.RegisterVolume(&v, dn) + vl.EnsureCorrectWritables(&v) + } if vl.UpdateVolumeSize(v.Id, v.Size, v.CompactRevision) { vl.AdjustActiveVolumeCountAfterRecovery(v.Id) } diff --git a/weed/topology/topology_test.go b/weed/topology/topology_test.go index 415a7198d..99124a600 100644 --- a/weed/topology/topology_test.go +++ b/weed/topology/topology_test.go @@ -573,3 +573,56 @@ func TestLookupDataNodeByAddress(t *testing.T) { t.Fatalf("address must be unregistered after UnRegisterDataNode, got %v", got) } } + +// TestSyncDataNodeRegistrationReRegistersMissingVolume reproduces the divergence +// where a volume present on a data node (shown by volume.list / admin UI) is +// missing from the lookup index, which surfaces as "volume id not found" on +// LookupVolume. SetVolumeUnavailable (used by +// UnRegisterDataNode on a disconnect) drops the volume from the index, and the +// reconnecting full heartbeat used to skip it because it was no longer "new" to +// the disk map. The full heartbeat now self-heals. +func TestSyncDataNodeRegistrationReRegistersMissingVolume(t *testing.T) { + topo := NewTopology("weedfs", sequence.NewMemorySequencer(), 32*1024, 5, false) + + dc := topo.GetOrCreateDataCenter("dc1") + rack := dc.GetOrCreateRack("rack1") + dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", map[string]uint32{"": 25}) + + vid := needle.VolumeId(18994) + volumeMessage := &master_pb.VolumeInformationMessage{ + Id: uint32(vid), + Size: 100, + Collection: "drr", + ReplicaPlacement: uint32(0), + Version: uint32(needle.GetCurrentVersion()), + Ttl: 0, + } + + // Initial full heartbeat registers the volume in the lookup index. + topo.SyncDataNodeRegistration([]*master_pb.VolumeInformationMessage{volumeMessage}, dn) + if got := topo.Lookup("", vid); len(got) != 1 { + t.Fatalf("after registration: lookup %d got %v, want 1 location", vid, got) + } + + // Drop the volume from the index the way UnRegisterDataNode does, but leave + // it in the data node's disk map (the reconnecting heartbeat did not report + // it as new). + rp, _ := super_block.NewReplicaPlacementFromString("000") + vl := topo.GetVolumeLayout("drr", rp, needle.EMPTY_TTL, types.HardDriveType) + vl.SetVolumeUnavailable(dn, vid) + + // The empty entry must be removed, otherwise Lookup returns a non-nil empty + // list that still reads as "not found". + if got := topo.Lookup("", vid); got != nil { + t.Fatalf("after SetVolumeUnavailable: expected lookup miss, got %v", got) + } + if _, err := dn.GetVolumesById(vid); err != nil { + t.Fatalf("volume %d should still be in the data node disk map: %v", vid, err) + } + + // The next full heartbeat re-registers the volume even though it is not new. + topo.SyncDataNodeRegistration([]*master_pb.VolumeInformationMessage{volumeMessage}, dn) + if got := topo.Lookup("", vid); len(got) != 1 { + t.Fatalf("after self-heal: lookup %d got %v, want 1 location", vid, got) + } +} diff --git a/weed/topology/volume_layout.go b/weed/topology/volume_layout.go index b25097e71..2beedd82e 100644 --- a/weed/topology/volume_layout.go +++ b/weed/topology/volume_layout.go @@ -502,6 +502,25 @@ func (vl *VolumeLayout) Lookup(vid needle.VolumeId) []*DataNode { return nil } +// HasDataNode reports whether the layout already lists dn as a location for vid. +// Used to detect a volume that is present on a data node but missing from the +// lookup index, so it can be re-registered. +func (vl *VolumeLayout) HasDataNode(vid needle.VolumeId, dn *DataNode) bool { + vl.accessLock.RLock() + defer vl.accessLock.RUnlock() + + location, ok := vl.vid2location[vid] + if !ok { + return false + } + for _, n := range location.list { + if n.Ip == dn.Ip && n.Port == dn.Port { + return true + } + } + return false +} + func (vl *VolumeLayout) ListVolumeServers() (nodes []*DataNode) { vl.accessLock.RLock() defer vl.accessLock.RUnlock() @@ -750,10 +769,21 @@ func (vl *VolumeLayout) SetVolumeUnavailable(dn *DataNode, vid needle.VolumeId) if location.Remove(dn) { vl.readonlyVolumes.Remove(vid, dn) vl.oversizedVolumes.Remove(vid, dn) + wasWritable := false if location.Length() < vl.rp.GetCopyCount() { glog.V(0).Infoln("Volume", vid, "has", location.Length(), "replica, less than required", vl.rp.GetCopyCount()) - return vl.removeFromWritable(vid) + wasWritable = vl.removeFromWritable(vid) } + if location.Length() == 0 { + // Drop the now-empty entry. Otherwise Lookup returns a non-nil + // empty location list, which surfaces as "volume id not found" + // even though the volume still appears in volume.list/admin UI. + // Mirrors UnRegisterVolume. + delete(vl.vid2location, vid) + delete(vl.sizeTracking, vid) + vl.removeFromCrowded(vid) + } + return wasWritable } } return false