From bfedea9bad44db0463e80c88b7606d73216898e2 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Mon, 27 Feb 2023 04:55:32 -0800 Subject: [PATCH] fix: disk healing should honor the right pool/set index (#16712) --- cmd/background-newdisks-heal-ops.go | 33 +++++++++++++++-------------- cmd/global-heal.go | 30 +++++++++++++++++++++----- 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/cmd/background-newdisks-heal-ops.go b/cmd/background-newdisks-heal-ops.go index 2dc11d9d4..f904ff4d5 100644 --- a/cmd/background-newdisks-heal-ops.go +++ b/cmd/background-newdisks-heal-ops.go @@ -299,7 +299,7 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint defer disk.Close() poolIdx := globalEndpoints.GetLocalPoolIdx(disk.Endpoint()) if poolIdx < 0 { - return fmt.Errorf("unexpected pool index (%d) found in %s", poolIdx, disk.Endpoint()) + return fmt.Errorf("unexpected pool index (%d) found for %s", poolIdx, disk.Endpoint()) } // Calculate the set index where the current endpoint belongs @@ -310,14 +310,15 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint return err } if setIdx < 0 { - return fmt.Errorf("unexpected set index (%d) found in %s", setIdx, disk.Endpoint()) + return fmt.Errorf("unexpected set index (%d) found for %s", setIdx, disk.Endpoint()) } // Prevent parallel erasure set healing locker := z.NewNSLock(minioMetaBucket, fmt.Sprintf("new-drive-healing/%d/%d", poolIdx, setIdx)) lkctx, err := locker.GetLock(ctx, newDiskHealingTimeout) if err != nil { - return err + return fmt.Errorf("Healing of drive '%v' on %s pool, belonging to %s erasure set already in progress: %w", + disk, humanize.Ordinal(poolIdx+1), humanize.Ordinal(setIdx+1), err) } ctx = lkctx.Context() defer locker.Unlock(lkctx) @@ -325,19 +326,20 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint // Load healing tracker in this disk tracker, err := loadHealingTracker(ctx, disk) if err != nil { - // A healing track can be not found when another disk in the same - // erasure set and same healing-id successfully finished healing. - if err == errFileNotFound { + // A healing tracker may be deleted if another disk in the + // same erasure set with same healing-id successfully finished + // healing. + if errors.Is(err, errFileNotFound) { return nil } - logger.LogIf(ctx, fmt.Errorf("Unable to load a healing tracker on '%s': %w", disk, err)) + logger.LogIf(ctx, fmt.Errorf("Unable to load healing tracker on '%s': %w, re-initializing..", disk, err)) tracker = newHealingTracker(disk, mustGetUUID()) } - logger.Info(fmt.Sprintf("Proceeding to heal '%s' - 'mc admin heal alias/ --verbose' to check the status.", endpoint)) + logger.Info(fmt.Sprintf("Healing drive '%s' - 'mc admin heal alias/ --verbose' to check the current status.", endpoint)) buckets, _ := z.ListBuckets(ctx, BucketOptions{}) - // Buckets data are dispersed in multiple zones/sets, make + // Buckets data are dispersed in multiple pools/sets, make // sure to heal all bucket metadata configuration. buckets = append(buckets, BucketInfo{ Name: pathJoin(minioMetaBucket, minioConfigPrefix), @@ -355,7 +357,7 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint }) if serverDebugLog { - logger.Info("Healing drive '%v' on %s pool", disk, humanize.Ordinal(poolIdx+1)) + logger.Info("Healing drive '%v' on %s pool, belonging to %s erasure set", disk, humanize.Ordinal(poolIdx+1), humanize.Ordinal(setIdx+1)) } // Load bucket totals @@ -378,9 +380,9 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint } if tracker.ItemsFailed > 0 { - logger.Info("Healing drive '%s' failed (healed: %d, failed: %d).", disk, tracker.ItemsHealed, tracker.ItemsFailed) + logger.Info("Healing of drive '%s' failed (healed: %d, failed: %d).", disk, tracker.ItemsHealed, tracker.ItemsFailed) } else { - logger.Info("Healing drive '%s' complete (healed: %d, failed: %d).", disk, tracker.ItemsHealed, tracker.ItemsFailed) + logger.Info("Healing of drive '%s' complete (healed: %d, failed: %d).", disk, tracker.ItemsHealed, tracker.ItemsFailed) } if len(tracker.QueuedBuckets) > 0 { @@ -392,7 +394,7 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint logger.Info("\n") } - if tracker.HealID == "" { // HealID is empty only before Feb 2023 + if tracker.HealID == "" { // HealID was empty only before Feb 2023 logger.LogIf(ctx, tracker.delete(ctx)) return nil } @@ -401,7 +403,7 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint for _, disk := range z.serverPools[poolIdx].sets[setIdx].getDisks() { t, err := loadHealingTracker(ctx, disk) if err != nil { - if err != errFileNotFound { + if !errors.Is(err, errFileNotFound) { logger.LogIf(ctx, err) } continue @@ -446,8 +448,7 @@ func monitorLocalDisksAndHeal(ctx context.Context, z *erasureServerPools) { for _, disk := range healDisks { go func(disk Endpoint) { globalBackgroundHealState.setDiskHealingStatus(disk, true) - err := healFreshDisk(ctx, z, disk) - if err != nil { + if err := healFreshDisk(ctx, z, disk); err != nil { globalBackgroundHealState.setDiskHealingStatus(disk, false) printEndpointError(disk, err, false) return diff --git a/cmd/global-heal.go b/cmd/global-heal.go index cac79b255..4ed70e148 100644 --- a/cmd/global-heal.go +++ b/cmd/global-heal.go @@ -269,7 +269,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, }() // Note: updates from healEntry to tracker must be sent on results channel. - healEntry := func(entry metaCacheEntry) { + healEntry := func(bucket string, entry metaCacheEntry) { if entry.name == "" && len(entry.metadata) == 0 { // ignore entries that don't have metadata. return @@ -278,6 +278,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, // ignore healing entry.name's with `/` suffix. return } + // We might land at .metacache, .trash, .multipart // no need to heal them skip, only when bucket // is '.minio.sys' @@ -302,6 +303,11 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, versionID: "", }, madmin.HealItemObject) if err != nil { + if isErrObjectNotFound(err) { + // queueing happens across namespace, ignore + // objects that are not found. + return + } result = healEntryFailure(0) logger.LogIf(ctx, fmt.Errorf("unable to heal object %s/%s: %w", bucket, entry.name, err)) } else { @@ -317,12 +323,19 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, return } + var versionNotFound int for _, version := range fivs.Versions { if err := bgSeq.queueHealTask(healSource{ bucket: bucket, object: version.Name, versionID: version.VersionID, }, madmin.HealItemObject); err != nil { + if isErrObjectNotFound(err) { + // queueing happens across namespace, ignore + // objects that are not found. + versionNotFound++ + continue + } // If not deleted, assume they failed. result = healEntryFailure(uint64(version.Size)) if version.VersionID != "" { @@ -341,6 +354,10 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, case results <- result: } } + // All versions resulted in 'ObjectNotFound' + if versionNotFound == len(fivs.Versions) { + return + } select { case <-ctx.Done(): return @@ -351,22 +368,25 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, waitForLowHTTPReq() } + actualBucket, prefix := path2BucketObject(bucket) + // How to resolve partial results. resolver := metadataResolutionParams{ dirQuorum: 1, objQuorum: 1, - bucket: bucket, + bucket: actualBucket, } err := listPathRaw(ctx, listPathRawOptions{ disks: disks, - bucket: bucket, + bucket: actualBucket, + path: prefix, recursive: true, forwardTo: forwardTo, minDisks: 1, reportNotFound: false, agreed: func(entry metaCacheEntry) { - healEntry(entry) + healEntry(actualBucket, entry) }, partial: func(entries metaCacheEntries, _ []error) { entry, ok := entries.resolve(&resolver) @@ -375,7 +395,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, // proceed to heal nonetheless. entry, _ = entries.firstFound() } - healEntry(*entry) + healEntry(actualBucket, *entry) }, finished: nil, })