Files
anchorage/internal/pkg/metrics/refresh.go
William Gill 12bf35caf8 anchorage v1.0 initial tree
Greenfield Go multi-tenant IPFS Pinning Service wire-compatible with the
IPFS Pinning Services API spec. Paired 1:1 with Kubo over localhost RPC,
clustered via embedded NATS JetStream, Postgres source-of-truth with
RLS-enforced tenancy, Fiber + huma v2 for the HTTP surface, Authentik
OIDC for session login with kid-rotated HS256 JWT API tokens.

Feature-complete against the 22-milestone build plan, including the
ship-it v1.0 gap items:

  * admin CLIs: drain/uncordon, maintenance, mint-token, rotate-key,
    prune-denylist, rebalance --dry-run, cache-stats, cluster-presences
  * TTL leader election via NATS KV, fence tokens, JetStream dedup
  * rebalancer (plan/apply split), reconciler, requeue sweeper
  * ristretto caches with NATS-backed cross-node invalidation
    (placements live-nodes + token denylist)
  * maintenance watchdog for stuck cluster-pause flag
  * Prometheus /metrics with CIDR ACL, HTTP/pin/scheduler/cache gauges
  * rate limiting: session (10/min) + anonymous global (120/min)
  * integration tests: rebalance, refcount multi-org, RLS belt
  * goreleaser (tar + deb/rpm/apk + Alpine Docker) targeting Gitea

Stack: Cobra/Viper, Fiber v2 + huma v2, embedded NATS JetStream,
pgx/sqlc/golang-migrate, ristretto, TypeID, prometheus/client_golang,
testcontainers-go.
2026-04-16 18:13:36 -05:00

97 lines
3.0 KiB
Go

package metrics
import (
"context"
"log/slog"
"time"
"anchorage/internal/pkg/cache"
"anchorage/internal/pkg/store"
)
// RefreshGauges keeps the slow-moving Prometheus collectors in this
// package up to date by running a refresh pass (see tick) once
// immediately and then once per interval until ctx is cancelled. It
// blocks for that whole time, so callers are expected to run it in its
// own goroutine.
//
// Each pass updates:
//
//   - NodesLive: number of nodes returned by the store's live-node listing
//   - PlacementsByStatus: one sample per status label
//   - CacheHits / CacheMisses: counters mirrored from cache.AllStats()
//
// The interval is fixed at 15 seconds: gauges may be up to one interval
// stale, which is accepted in exchange for not querying the store on
// every scrape.
//
// A nil s makes the call return immediately without touching any
// collector, which is convenient for tests and partially wired dev
// runs. Note that this is a plain interface nil check; an interface
// value wrapping a nil pointer is not caught by it.
func RefreshGauges(ctx context.Context, s store.Store) {
	const interval = 15 * time.Second

	if s == nil {
		return
	}

	// Run one pass before the ticker starts so the collectors are
	// populated right after startup instead of one interval later.
	tick(ctx, s)

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	done := ctx.Done()
	for {
		select {
		case <-ticker.C:
			tick(ctx, s)
		case <-done:
			return
		}
	}
}
// tick performs a single refresh pass over all collectors. Failures are
// deliberately non-fatal: a query error is logged at warn level and the
// affected collector keeps whatever value it had from the last
// successful pass.
func tick(ctx context.Context, s store.Store) {
	if live, err := s.Nodes().ListLive(ctx); err != nil {
		slog.Warn("metrics: refresh NodesLive", "err", err)
	} else {
		NodesLive.Set(float64(len(live)))
	}

	if counts, err := s.Pins().CountPlacementsByStatus(ctx); err != nil {
		slog.Warn("metrics: refresh PlacementsByStatus", "err", err)
	} else {
		// Clear the vector before repopulating it, so a status that is
		// no longer present in counts does not keep exporting its old
		// non-zero sample.
		//
		// NOTE(review): Reset and the Sets below are separate calls, so
		// a scrape landing in between would presumably see the vector
		// partially filled — confirm whether that matters.
		PlacementsByStatus.Reset()
		for label, total := range counts {
			PlacementsByStatus.WithLabelValues(label).Set(float64(total))
		}
	}

	// Mirror the cache hit/miss totals. The Prometheus side is a
	// counter, which can only be advanced with Add, while the cache
	// side reports running totals. Adding the full total on every pass
	// would count the same hits repeatedly, so observeCacheDelta
	// remembers the last total per cache and adds only the difference.
	// Its package-level state is unsynchronized, which relies on this
	// function being called from the single RefreshGauges goroutine.
	for _, snapshot := range cache.AllStats() {
		observeCacheDelta(snapshot)
	}
}
// prevCacheStats holds, per cache name, the totals that have already
// been mirrored into the Prometheus counters, so that each refresh pass
// only adds what accumulated since the previous one. It is not
// synchronized; that is safe only while observeCacheDelta is called
// from a single goroutine.
var prevCacheStats = map[string]cache.Stats{}

// observeCacheDelta advances CacheHits and CacheMisses for
// current.Name by the growth in current.Hits and current.Misses since
// the last call for the same name, then stores current as the new
// baseline. The first call for a name uses a zero baseline, so the full
// totals are added.
//
// A total lower than its baseline is treated as a reset of the source
// counter (for example the cache was rebuilt or its metrics cleared):
// everything in the current total accumulated after the reset, so the
// whole total is added. Comparing before subtracting also keeps the
// arithmetic correct if the Stats fields are unsigned, where a plain
// subtraction would wrap around to an enormous positive delta.
func observeCacheDelta(current cache.Stats) {
	prev := prevCacheStats[current.Name]

	hits := current.Hits
	if hits >= prev.Hits {
		hits -= prev.Hits
	}
	if hits > 0 {
		CacheHits.WithLabelValues(current.Name).Add(float64(hits))
	}

	misses := current.Misses
	if misses >= prev.Misses {
		misses -= prev.Misses
	}
	if misses > 0 {
		CacheMisses.WithLabelValues(current.Name).Add(float64(misses))
	}

	prevCacheStats[current.Name] = current
}