From 4c087278cb0fb7a0e9eeb3e025a3f4bbc23234dd Mon Sep 17 00:00:00 2001 From: Catherine Date: Tue, 23 Sep 2025 01:49:33 +0000 Subject: [PATCH] Fly.io: switch health check method to `[[services.http_checks]]`. More specifically, remove the dedicated HTTP datapath for health checks and verify the entire stack, from TLS frontend to S3 backend. Verifying too little has resulted in a small outage recently when the pages listener got misconfigured but the health listener happily accepted connections like normal. This would not happen now that the health check uses port 443, too. --- conf/Caddyfile | 16 ++++------------ fly.toml | 34 ++++++++++++++++++++++++---------- src/pages.go | 17 +++++++++++------ 3 files changed, 39 insertions(+), 28 deletions(-) diff --git a/conf/Caddyfile b/conf/Caddyfile index a417846..7d5b4af 100644 --- a/conf/Caddyfile +++ b/conf/Caddyfile @@ -57,11 +57,6 @@ protocols h3 } - servers :2002 { - name health - protocols h1 - } - servers :2019 { name metrics protocols h1 @@ -82,9 +77,10 @@ } http:// { - # initial PUT/POST for a new domain has to happen over HTTP - @get method GET - redir @get https://{host}{uri} 301 + # initial PUT/POST for a new domain has to happen over HTTP; + # health check also has to reach the backend over HTTP + @upgrade `method('GET') && !header({'Health-Check': '*'})` + redir @upgrade https://{host}{uri} 301 import backend } @@ -100,10 +96,6 @@ https:// { import backend } -http://localhost:2002 { - reverse_proxy http://localhost:3002 -} - http://:2019 { metrics } diff --git a/fly.toml b/fly.toml index 8129984..3c29dd7 100644 --- a/fly.toml +++ b/fly.toml @@ -43,6 +43,15 @@ proxy_proto_options = { version = "v2" } type = "connections" soft_limit = 250 +[[services.http_checks]] +protocol = "http" +method = "get" +path = "/" +headers = { Health-Check = "🩺", Host = "localhost" } +grace_period = "5s" +interval = "2s" +timeout = "1.5s" + # [::]:433/TCP; HTTP/1.1 and HTTP/2 [[services]] @@ -60,6 +69,20 @@ proxy_proto_options = { version = "v2" } type = "connections" soft_limit = 250 +[[services.http_checks]] +protocol = "https" +method = "get" +path = "/" +headers = { Health-Check = "🩺", Host = "localhost" } +grace_period = "5s" +interval = "2s" +timeout = "1.5s" +# At the moment there's no good way to handle this, so staging needs TLS keys from production +# for this one host that isn't used for anything other than full stack health checks. +# These can be copied over manually whenever they expire. +tls_skip_verify = false +tls_server_name = "git-pages.fly.dev" + # 0.0.0.0:433/UDP; HTTP/3 # (Fly.io does not support UDP on public IPv6!) @@ -70,16 +93,7 @@ ports = [{ port = 443 }] auto_stop_machines = "stop" auto_start_machines = true -# monitoring - -[checks.health] -type = "http" -method = "get" -port = 2002 -path = "/" -grace_period = "5s" -interval = "2s" -timeout = "1s" +# Metrics [metrics] port = 2019 diff --git a/src/pages.go b/src/pages.go index 0b464a7..e243ddf 100644 --- a/src/pages.go +++ b/src/pages.go @@ -444,12 +444,17 @@ func postPage(w http.ResponseWriter, r *http.Request) error { } func ServePages(w http.ResponseWriter, r *http.Request) { - log.Println("pages:", r.Method, r.Host, r.URL, r.Header.Get("Content-Type")) - if region := os.Getenv("FLY_REGION"); region != "" { - w.Header().Add("Server", - fmt.Sprintf("git-pages (fly.io; %s; %s)", region, os.Getenv("FLY_MACHINE_ID"))) - } else { - w.Header().Add("Server", "git-pages") + // We want upstream health checks to be done as closely to the normal flow as possible; + // any intentional deviation is an opportunity to miss an issue that will affect our + // visitors but not our health checks. + if r.Header.Get("Health-Check") == "" { + log.Println("pages:", r.Method, r.Host, r.URL, r.Header.Get("Content-Type")) + if region := os.Getenv("FLY_REGION"); region != "" { + w.Header().Add("Server", + fmt.Sprintf("git-pages (fly.io; %s; %s)", region, os.Getenv("FLY_MACHINE_ID"))) + } else { + w.Header().Add("Server", "git-pages") + } } err := error(nil) switch r.Method {