mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-05-30 21:46:21 +00:00
fix(master): re-register volumes missing from the lookup index
A disconnect/reconnect race could drop a volume from vid2location while it stayed in the data node's disk map, so it showed in volume.list and the admin UI but LookupVolume returned "volume id not found" and never self-healed (the full heartbeat only registered volumes new to the disk map). The full heartbeat now re-registers any reported volume missing from the lookup index, reusing the already-resolved VolumeLayout.
This commit is contained in:
@@ -586,6 +586,19 @@ func (t *Topology) SyncDataNodeRegistration(volumes []*master_pb.VolumeInformati
|
||||
}
|
||||
diskType := types.ToDiskType(v.DiskType)
|
||||
vl := t.GetVolumeLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
|
||||
// Self-heal: a volume reported by the data node but missing from the
|
||||
// lookup index is re-registered. This repairs the split left by a
|
||||
// disconnect/reconnect race, where UnRegisterDataNode dropped the volume
|
||||
// from vid2location but the reconnecting full heartbeat skipped it
|
||||
// (still in the disk map, so UpdateVolumes did not report it as new).
|
||||
// Without this, the volume stays visible in volume.list/admin UI yet
|
||||
// LookupVolume returns "volume id not found".
|
||||
if !vl.HasDataNode(v.Id, dn) {
|
||||
// vl is already resolved above; call it directly instead of
|
||||
// RegisterVolumeLayout, which would repeat the GetVolumeLayout lookup.
|
||||
vl.RegisterVolume(&v, dn)
|
||||
vl.EnsureCorrectWritables(&v)
|
||||
}
|
||||
if vl.UpdateVolumeSize(v.Id, v.Size, v.CompactRevision) {
|
||||
vl.AdjustActiveVolumeCountAfterRecovery(v.Id)
|
||||
}
|
||||
|
||||
@@ -573,3 +573,56 @@ func TestLookupDataNodeByAddress(t *testing.T) {
|
||||
t.Fatalf("address must be unregistered after UnRegisterDataNode, got %v", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSyncDataNodeRegistrationReRegistersMissingVolume reproduces the divergence
|
||||
// where a volume present on a data node (shown by volume.list / admin UI) is
|
||||
// missing from the lookup index, which surfaces as "volume id not found" on
|
||||
// LookupVolume. SetVolumeUnavailable (used by
|
||||
// UnRegisterDataNode on a disconnect) drops the volume from the index, and the
|
||||
// reconnecting full heartbeat used to skip it because it was no longer "new" to
|
||||
// the disk map. The full heartbeat now self-heals.
|
||||
func TestSyncDataNodeRegistrationReRegistersMissingVolume(t *testing.T) {
|
||||
topo := NewTopology("weedfs", sequence.NewMemorySequencer(), 32*1024, 5, false)
|
||||
|
||||
dc := topo.GetOrCreateDataCenter("dc1")
|
||||
rack := dc.GetOrCreateRack("rack1")
|
||||
dn := rack.GetOrCreateDataNode("127.0.0.1", 34534, 0, "127.0.0.1", "", map[string]uint32{"": 25})
|
||||
|
||||
vid := needle.VolumeId(18994)
|
||||
volumeMessage := &master_pb.VolumeInformationMessage{
|
||||
Id: uint32(vid),
|
||||
Size: 100,
|
||||
Collection: "drr",
|
||||
ReplicaPlacement: uint32(0),
|
||||
Version: uint32(needle.GetCurrentVersion()),
|
||||
Ttl: 0,
|
||||
}
|
||||
|
||||
// Initial full heartbeat registers the volume in the lookup index.
|
||||
topo.SyncDataNodeRegistration([]*master_pb.VolumeInformationMessage{volumeMessage}, dn)
|
||||
if got := topo.Lookup("", vid); len(got) != 1 {
|
||||
t.Fatalf("after registration: lookup %d got %v, want 1 location", vid, got)
|
||||
}
|
||||
|
||||
// Drop the volume from the index the way UnRegisterDataNode does, but leave
|
||||
// it in the data node's disk map (the reconnecting heartbeat did not report
|
||||
// it as new).
|
||||
rp, _ := super_block.NewReplicaPlacementFromString("000")
|
||||
vl := topo.GetVolumeLayout("drr", rp, needle.EMPTY_TTL, types.HardDriveType)
|
||||
vl.SetVolumeUnavailable(dn, vid)
|
||||
|
||||
// The empty entry must be removed, otherwise Lookup returns a non-nil empty
|
||||
// list that still reads as "not found".
|
||||
if got := topo.Lookup("", vid); got != nil {
|
||||
t.Fatalf("after SetVolumeUnavailable: expected lookup miss, got %v", got)
|
||||
}
|
||||
if _, err := dn.GetVolumesById(vid); err != nil {
|
||||
t.Fatalf("volume %d should still be in the data node disk map: %v", vid, err)
|
||||
}
|
||||
|
||||
// The next full heartbeat re-registers the volume even though it is not new.
|
||||
topo.SyncDataNodeRegistration([]*master_pb.VolumeInformationMessage{volumeMessage}, dn)
|
||||
if got := topo.Lookup("", vid); len(got) != 1 {
|
||||
t.Fatalf("after self-heal: lookup %d got %v, want 1 location", vid, got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -502,6 +502,25 @@ func (vl *VolumeLayout) Lookup(vid needle.VolumeId) []*DataNode {
|
||||
return nil
|
||||
}
|
||||
|
||||
// HasDataNode reports whether the layout already lists dn as a location for vid.
|
||||
// Used to detect a volume that is present on a data node but missing from the
|
||||
// lookup index, so it can be re-registered.
|
||||
func (vl *VolumeLayout) HasDataNode(vid needle.VolumeId, dn *DataNode) bool {
|
||||
vl.accessLock.RLock()
|
||||
defer vl.accessLock.RUnlock()
|
||||
|
||||
location, ok := vl.vid2location[vid]
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
for _, n := range location.list {
|
||||
if n.Ip == dn.Ip && n.Port == dn.Port {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (vl *VolumeLayout) ListVolumeServers() (nodes []*DataNode) {
|
||||
vl.accessLock.RLock()
|
||||
defer vl.accessLock.RUnlock()
|
||||
@@ -750,10 +769,21 @@ func (vl *VolumeLayout) SetVolumeUnavailable(dn *DataNode, vid needle.VolumeId)
|
||||
if location.Remove(dn) {
|
||||
vl.readonlyVolumes.Remove(vid, dn)
|
||||
vl.oversizedVolumes.Remove(vid, dn)
|
||||
wasWritable := false
|
||||
if location.Length() < vl.rp.GetCopyCount() {
|
||||
glog.V(0).Infoln("Volume", vid, "has", location.Length(), "replica, less than required", vl.rp.GetCopyCount())
|
||||
return vl.removeFromWritable(vid)
|
||||
wasWritable = vl.removeFromWritable(vid)
|
||||
}
|
||||
if location.Length() == 0 {
|
||||
// Drop the now-empty entry. Otherwise Lookup returns a non-nil
|
||||
// empty location list, which surfaces as "volume id not found"
|
||||
// even though the volume still appears in volume.list/admin UI.
|
||||
// Mirrors UnRegisterVolume.
|
||||
delete(vl.vid2location, vid)
|
||||
delete(vl.sizeTracking, vid)
|
||||
vl.removeFromCrowded(vid)
|
||||
}
|
||||
return wasWritable
|
||||
}
|
||||
}
|
||||
return false
|
||||
|
||||
Reference in New Issue
Block a user