From b49e37feeeb8b50fceb3b1975fcd3353061b0a9e Mon Sep 17 00:00:00 2001 From: Lai Jiang Date: Tue, 11 Mar 2025 10:14:11 -0400 Subject: [PATCH] Add a GCB job to delete GAE canary versions (#2714) We've seen this issue happen more often than not recently, where GAE canary deployment is stuck for about 10 min and then fails. The reason is not clear, but deleting the canary version prior to a deployment always fixes the issue. --- release/cloudbuild-delete-canary.yaml | 46 +++++++++++++++++++++++++++ release/cloudbuild-release.yaml | 3 ++ 2 files changed, 49 insertions(+) create mode 100644 release/cloudbuild-delete-canary.yaml diff --git a/release/cloudbuild-delete-canary.yaml b/release/cloudbuild-delete-canary.yaml new file mode 100644 index 000000000..3589328f3 --- /dev/null +++ b/release/cloudbuild-delete-canary.yaml @@ -0,0 +1,46 @@ +# This will delete canary GAE versions named "nomulus". +# +# For reasons unknown, Spinnaker occasionally gets stuck when deploying to GAE +# canary, and the fix is to manually delete the canary versions before the +# deployment. +# +# To manually trigger a build on GCB, run: +# gcloud builds submit --config=cloudbuild-delete-canary.yaml \ +# --substitutions=_ENV=[ENV] .. +# +# To trigger a build automatically, follow the instructions below and add a trigger: +# https://cloud.google.com/cloud-build/docs/running-builds/automate-builds +# +steps: +# Pull the credential for nomulus tool. +- name: 'gcr.io/$PROJECT_ID/builder:latest' + entrypoint: /bin/bash + args: + - -c + - | + set -e + gcloud secrets versions access latest \ + --secret nomulus-tool-cloudbuild-credential > tool-credential.json +# Delete the canary GAE versions.
+- name: 'gcr.io/$PROJECT_ID/builder:latest' + entrypoint: /bin/bash + args: + - -c + - | + if [ "${_ENV}" == production ] + then + project_id="domain-registry" + else + project_id="domain-registry-${_ENV}" + fi + + gcloud auth activate-service-account --key-file=tool-credential.json + + for service in default pubapi backend bsa tools console + do + gcloud app versions delete nomulus --service="$service" \ + --project="$project_id" --quiet; + done +timeout: 3600s +options: + machineType: 'N1_HIGHCPU_8' diff --git a/release/cloudbuild-release.yaml b/release/cloudbuild-release.yaml index 93975e112..3dc8e7071 100644 --- a/release/cloudbuild-release.yaml +++ b/release/cloudbuild-release.yaml @@ -88,6 +88,7 @@ steps: sed -i s/builder:latest/builder@$builder_digest/g release/cloudbuild-schema-deploy.yaml sed -i s/builder:latest/builder@$builder_digest/g release/cloudbuild-schema-verify.yaml sed -i s/builder:latest/builder@$builder_digest/g release/cloudbuild-delete.yaml + sed -i s/builder:latest/builder@$builder_digest/g release/cloudbuild-delete-canary.yaml sed -i s/builder:latest/builder@$builder_digest/g release/cloudbuild-restart-proxies.yaml sed -i s/GCP_PROJECT/${PROJECT_ID}/ proxy/kubernetes/proxy-*.yaml sed -i s/'$${TAG_NAME}'/${TAG_NAME}/g release/cloudbuild-sync-and-tag.yaml @@ -100,6 +101,8 @@ steps: > release/cloudbuild-deploy-gke-${environment}.yaml sed s/'$${_ENV}'/${environment}/g release/cloudbuild-delete.yaml \ > release/cloudbuild-delete-${environment}.yaml + sed s/'$${_ENV}'/${environment}/g release/cloudbuild-delete-canary.yaml \ + > release/cloudbuild-delete-canary-${environment}.yaml sed s/'$${_ENV}'/${environment}/g release/cloudbuild-restart-proxies.yaml \ > release/cloudbuild-restart-proxies-${environment}.yaml sed s/'$${_ENV}'/${environment}/g release/cloudbuild-restart-proxies.yaml | \