Greenfield Go multi-tenant IPFS Pinning Service wire-compatible with the
IPFS Pinning Services API spec. Paired 1:1 with Kubo over localhost RPC,
clustered via embedded NATS JetStream, Postgres source-of-truth with
RLS-enforced tenancy, Fiber + huma v2 for the HTTP surface, Authentik
OIDC for session login with kid-rotated HS256 JWT API tokens.
Feature-complete against the 22-milestone build plan, including the
ship-it v1.0 gap items:
* admin CLIs: drain/uncordon, maintenance, mint-token, rotate-key,
prune-denylist, rebalance --dry-run, cache-stats, cluster-presences
* TTL leader election via NATS KV, fence tokens, JetStream dedup
* rebalancer (plan/apply split), reconciler, requeue sweeper
* ristretto caches with NATS-backed cross-node invalidation
(placements live-nodes + token denylist)
* maintenance watchdog for stuck cluster-pause flag
* Prometheus /metrics with CIDR ACL, HTTP/pin/scheduler/cache gauges
* rate limiting: session (10/min) + anonymous global (120/min)
* integration tests: rebalance, refcount multi-org, RLS belt
* goreleaser (tar + deb/rpm/apk + Alpine Docker) targeting Gitea
Stack: Cobra/Viper, Fiber v2 + huma v2, embedded NATS JetStream,
pgx/sqlc/golang-migrate, ristretto, TypeID, prometheus/client_golang,
testcontainers-go.
97 lines
3.0 KiB
Go
97 lines
3.0 KiB
Go
package metrics
|
|
|
|
import (
|
|
"context"
|
|
"log/slog"
|
|
"time"
|
|
|
|
"anchorage/internal/pkg/cache"
|
|
"anchorage/internal/pkg/store"
|
|
)
|
|
|
|
// RefreshGauges periodically snapshots slow-moving values from the
|
|
// store + cache registry into Prometheus gauges. The goroutine runs
|
|
// until ctx is cancelled; ticker cadence is fixed at 15 seconds —
|
|
// Prometheus scrapers typically poll every 15-60s and seeing stale-by-15s
|
|
// gauges is an acceptable trade-off for avoiding hot query paths.
|
|
//
|
|
// Collectors populated:
|
|
//
|
|
// - NodesLive: count of nodes with status='up'
|
|
// - PlacementsByStatus: one sample per status label
|
|
// - CacheHits / CacheMisses: monotonic counters mirrored from
|
|
// cache.AllStats() (both our ristretto-backed Cache[K,V] and the
|
|
// hand-rolled pin.CachedLiveNodes register as StatsProviders)
|
|
//
|
|
// The function is a no-op when s is nil (useful in tests and in
|
|
// dev-mode runs with incomplete wiring).
|
|
func RefreshGauges(ctx context.Context, s store.Store) {
|
|
if s == nil {
|
|
return
|
|
}
|
|
// Tick immediately so gauges aren't zero for the first 15s after
|
|
// startup.
|
|
tick(ctx, s)
|
|
|
|
t := time.NewTicker(15 * time.Second)
|
|
defer t.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-t.C:
|
|
tick(ctx, s)
|
|
}
|
|
}
|
|
}
|
|
|
|
// tick is one refresh pass. Keeps errors non-fatal — a failing query
|
|
// leaves the last good value in place, which is the least-surprising
|
|
// behavior for a metrics endpoint.
|
|
func tick(ctx context.Context, s store.Store) {
|
|
if live, err := s.Nodes().ListLive(ctx); err == nil {
|
|
NodesLive.Set(float64(len(live)))
|
|
} else {
|
|
slog.Warn("metrics: refresh NodesLive", "err", err)
|
|
}
|
|
|
|
if counts, err := s.Pins().CountPlacementsByStatus(ctx); err == nil {
|
|
// Reset the vector first so statuses that dropped to 0 stop
|
|
// reporting stale non-zero values.
|
|
PlacementsByStatus.Reset()
|
|
for status, n := range counts {
|
|
PlacementsByStatus.WithLabelValues(status).Set(float64(n))
|
|
}
|
|
} else {
|
|
slog.Warn("metrics: refresh PlacementsByStatus", "err", err)
|
|
}
|
|
|
|
// Mirror cache counters. These are monotonic in ristretto; we
|
|
// re-Set them (the Prometheus counter API demands Add, but we
|
|
// registered these as CounterVec — using Add on a monotonic delta
|
|
// would double-count across ticks).
|
|
//
|
|
// Workaround: track previous totals and only Add the delta since
|
|
// last tick. Module-level state is fine because RefreshGauges runs
|
|
// as a single goroutine.
|
|
for _, s := range cache.AllStats() {
|
|
observeCacheDelta(s)
|
|
}
|
|
}
|
|
|
|
// prevCacheStats tracks per-name counter values we've already pushed
|
|
// so subsequent ticks only Add the delta. Single-writer goroutine
|
|
// means no locking required.
|
|
var prevCacheStats = map[string]cache.Stats{}
|
|
|
|
func observeCacheDelta(current cache.Stats) {
|
|
prev := prevCacheStats[current.Name]
|
|
if dh := current.Hits - prev.Hits; dh > 0 {
|
|
CacheHits.WithLabelValues(current.Name).Add(float64(dh))
|
|
}
|
|
if dm := current.Misses - prev.Misses; dm > 0 {
|
|
CacheMisses.WithLabelValues(current.Name).Add(float64(dm))
|
|
}
|
|
prevCacheStats[current.Name] = current
|
|
}
|