Fly.io: switch health check method to [[services.http_checks]].

More specifically, remove the dedicated HTTP datapath for health
checks and verify the entire stack, from TLS frontend to S3 backend.
Verifying too little has resulted in a small outage recently when
the pages listener got misconfigured but the health listener happily
accepted connections like normal. This would not happen now that
the health check uses port 443, too.
This commit is contained in:
Catherine
2025-09-23 01:49:33 +00:00
parent 922cc6315a
commit 4c087278cb
3 changed files with 39 additions and 28 deletions

View File

@@ -57,11 +57,6 @@
protocols h3
}
servers :2002 {
name health
protocols h1
}
servers :2019 {
name metrics
protocols h1
@@ -82,9 +77,10 @@
}
http:// {
# initial PUT/POST for a new domain has to happen over HTTP
@get method GET
redir @get https://{host}{uri} 301
# initial PUT/POST for a new domain has to happen over HTTP;
# health check also has to reach the backend over HTTP
@upgrade `method('GET') && !header({'Health-Check': '*'})`
redir @upgrade https://{host}{uri} 301
import backend
}
@@ -100,10 +96,6 @@ https:// {
import backend
}
http://localhost:2002 {
reverse_proxy http://localhost:3002
}
http://:2019 {
metrics
}

View File

@@ -43,6 +43,15 @@ proxy_proto_options = { version = "v2" }
type = "connections"
soft_limit = 250
[[services.http_checks]]
protocol = "http"
method = "get"
path = "/"
headers = { Health-Check = "🩺", Host = "localhost" }
grace_period = "5s"
interval = "2s"
timeout = "1.5s"
# [::]:443/TCP; HTTP/1.1 and HTTP/2
[[services]]
@@ -60,6 +69,20 @@ proxy_proto_options = { version = "v2" }
type = "connections"
soft_limit = 250
[[services.http_checks]]
protocol = "https"
method = "get"
path = "/"
headers = { Health-Check = "🩺", Host = "localhost" }
grace_period = "5s"
interval = "2s"
timeout = "1.5s"
# At the moment there's no good way to handle this, so staging needs TLS keys from production
# for this one host that isn't used for anything other than full stack health checks.
# These can be copied over manually whenever they expire.
tls_skip_verify = false
tls_server_name = "git-pages.fly.dev"
# 0.0.0.0:443/UDP; HTTP/3
# (Fly.io does not support UDP on public IPv6!)
@@ -70,16 +93,7 @@ ports = [{ port = 443 }]
auto_stop_machines = "stop"
auto_start_machines = true
# monitoring
[checks.health]
type = "http"
method = "get"
port = 2002
path = "/"
grace_period = "5s"
interval = "2s"
timeout = "1s"
# Metrics
[metrics]
port = 2019

View File

@@ -444,12 +444,17 @@ func postPage(w http.ResponseWriter, r *http.Request) error {
}
func ServePages(w http.ResponseWriter, r *http.Request) {
log.Println("pages:", r.Method, r.Host, r.URL, r.Header.Get("Content-Type"))
if region := os.Getenv("FLY_REGION"); region != "" {
w.Header().Add("Server",
fmt.Sprintf("git-pages (fly.io; %s; %s)", region, os.Getenv("FLY_MACHINE_ID")))
} else {
w.Header().Add("Server", "git-pages")
// We want upstream health checks to be done as closely to the normal flow as possible;
// any intentional deviation is an opportunity to miss an issue that will affect our
// visitors but not our health checks.
if r.Header.Get("Health-Check") == "" {
log.Println("pages:", r.Method, r.Host, r.URL, r.Header.Get("Content-Type"))
if region := os.Getenv("FLY_REGION"); region != "" {
w.Header().Add("Server",
fmt.Sprintf("git-pages (fly.io; %s; %s)", region, os.Getenv("FLY_MACHINE_ID")))
} else {
w.Header().Add("Server", "git-pages")
}
}
err := error(nil)
switch r.Method {