From dbfb5e797b18d1ebdbec95af16017cfe85727aae Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Wed, 1 May 2024 08:18:21 -0700 Subject: [PATCH] Wait 3 minutes after startup to restart decommissioning (#19645) Typically not all drives are connected right after startup, so we delay 3 minutes before resuming. This greatly reduces the risk of starting to list unconnected drives, or drives that are at risk of being disconnected soon. This delay is not applied when starting with an admin call. --- cmd/erasure-server-pool-decom.go | 4 ++++ cmd/utils.go | 18 +++++++----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cmd/erasure-server-pool-decom.go b/cmd/erasure-server-pool-decom.go index 3f66dfc2e..946350af5 100644 --- a/cmd/erasure-server-pool-decom.go +++ b/cmd/erasure-server-pool-decom.go @@ -535,6 +535,10 @@ func (z *erasureServerPools) Init(ctx context.Context) error { if len(poolIndices) > 0 && globalEndpoints[poolIndices[0]].Endpoints[0].IsLocal { go func() { + // Resume decommissioning of pools, but wait 3 minutes for cluster to stabilize. + if err := sleepContext(ctx, 3*time.Minute); err != nil { + return + } r := rand.New(rand.NewSource(time.Now().UnixNano())) for { if err := z.Decommission(ctx, poolIndices...); err != nil { diff --git a/cmd/utils.go b/cmd/utils.go index 531272df6..1eacc3419 100644 --- a/cmd/utils.go +++ b/cmd/utils.go @@ -1127,16 +1127,12 @@ func ptr[T any](a T) *T { return &a } -func max(a, b int) int { - if a > b { - return a +// sleepContext sleeps for d duration or until ctx is done. +func sleepContext(ctx context.Context, d time.Duration) error { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(d): } - return b -} - -func min(a, b int) int { - if a < b { - return a - } - return b + return nil }