From b737c83a66812ba60a5bbe3ee5ddd12dd0e32255 Mon Sep 17 00:00:00 2001 From: Anis Elleuch Date: Wed, 24 Aug 2022 21:46:09 +0100 Subject: [PATCH] Ensure that only one node performs site replication healing (#15584) When a node finds a change in the other replication cluster and applies it to itself, it already notifies its other peers. There is no need for all nodes in a given cluster to do site replication healing; a single node is sufficient. --- cmd/site-replication.go | 31 +++++++++++++++++++ .../run-multi-site-minio-idp.sh | 2 +- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/cmd/site-replication.go b/cmd/site-replication.go index ab82d65c2..1066e3960 100644 --- a/cmd/site-replication.go +++ b/cmd/site-replication.go @@ -25,6 +25,7 @@ import ( "encoding/xml" "errors" "fmt" + "math/rand" "net/url" "reflect" "sort" @@ -3501,7 +3502,37 @@ func (c *SiteReplicationSys) PeerEditReq(ctx context.Context, arg madmin.PeerInf const siteHealTimeInterval = 10 * time.Second +var siteReplicationHealLockTimeout = newDynamicTimeoutWithOpts(dynamicTimeoutOpts{ + timeout: 30 * time.Second, + minimum: 10 * time.Second, + retryInterval: time.Second, +}) + func (c *SiteReplicationSys) startHealRoutine(ctx context.Context, objAPI ObjectLayer) { + r := rand.New(rand.NewSource(time.Now().UnixNano())) + // Run the site replication healing in a loop + for { + c.healRoutine(ctx, objAPI) + duration := time.Duration(r.Float64() * float64(time.Minute)) + if duration < time.Second { + // Make sure to sleep atleast a second to avoid high CPU ticks. + duration = time.Second + } + time.Sleep(duration) + } +} + +func (c *SiteReplicationSys) healRoutine(ctx context.Context, objAPI ObjectLayer) { + // Make sure only one node running site replication on the cluster. 
+ locker := objAPI.NewNSLock(minioMetaBucket, "site-replication/heal.lock") + lkctx, err := locker.GetLock(ctx, siteReplicationHealLockTimeout) + if err != nil { + return + } + ctx = lkctx.Context() + defer lkctx.Cancel() + // No unlock for "leader" lock. + healTimer := time.NewTimer(siteHealTimeInterval) defer healTimer.Stop() diff --git a/docs/site-replication/run-multi-site-minio-idp.sh b/docs/site-replication/run-multi-site-minio-idp.sh index 9581a7b18..aca0dcc29 100755 --- a/docs/site-replication/run-multi-site-minio-idp.sh +++ b/docs/site-replication/run-multi-site-minio-idp.sh @@ -335,7 +335,7 @@ kill -9 ${site1_pid} ./mc rb minio2/bucket2 # Restart minio1 instance minio server --config-dir /tmp/minio-internal --address ":9001" /tmp/minio-internal-idp1/{1...4} >/tmp/minio1_1.log 2>&1 & -sleep 30 +sleep 40 # Test whether most recent tag update on minio2 is replicated to minio1 val=$(./mc tag list minio1/newbucket --json | jq -r .tagset | jq -r .key )