From 82aebb70bf8c93828fd1950848ea93e2fdf628fe Mon Sep 17 00:00:00 2001 From: Catherine Date: Sat, 6 Dec 2025 01:21:17 +0000 Subject: [PATCH] Add basic garbage tracer. This isn't a concurrent GC and it cannot provide a reliable result; the output is just an estimate. --- flake.nix | 2 +- go.mod | 1 + go.sum | 2 ++ src/backend_s3.go | 2 +- src/garbage.go | 87 +++++++++++++++++++++++++++++++++++++++++++++++ src/main.go | 87 ++++++++++++++++++++++++++--------------------- 6 files changed, 141 insertions(+), 40 deletions(-) create mode 100644 src/garbage.go diff --git a/flake.nix b/flake.nix index 528699b..f9b8b34 100644 --- a/flake.nix +++ b/flake.nix @@ -43,7 +43,7 @@ "-s -w" ]; - vendorHash = "sha256-D5v6LpJZ+a2Dzdir/YzyFBwY/K4laTr58beywzXOsTM="; + vendorHash = "sha256-wwsxHEwCySO2Ykttf6C+GZupMWczVYkAhSVwaVZHNko="; }; in { diff --git a/go.mod b/go.mod index 7d24959..fdff873 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,7 @@ require ( github.com/KimMachineGun/automemlimit v0.7.5 github.com/c2h5oh/datasize v0.0.0-20231215233829-aa82cc1e6500 github.com/creasty/defaults v1.8.0 + github.com/dghubble/trie v0.1.0 github.com/fatih/color v1.18.0 github.com/getsentry/sentry-go v0.40.0 github.com/getsentry/sentry-go/slog v0.40.0 diff --git a/go.sum b/go.sum index 5bb3491..5c71e00 100644 --- a/go.sum +++ b/go.sum @@ -27,6 +27,8 @@ github.com/cyphar/filepath-securejoin v0.6.1/go.mod h1:A8hd4EnAeyujCJRrICiOWqjS1 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dghubble/trie v0.1.0 h1:kJnjBLFFElBwS60N4tkPvnLhnpcDxbBjIulgI8CpNGM= +github.com/dghubble/trie v0.1.0/go.mod h1:sOmnzfBNH7H92ow2292dDFWNsVQuh/izuD7otCYb1ak= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/elazarl/goproxy v1.7.2 h1:Y2o6urb7Eule09PjlhQRGNsqRfPmYI3KKQLFpCAV3+o= diff --git a/src/backend_s3.go b/src/backend_s3.go index d5f209a..129a890 100644 --- a/src/backend_s3.go +++ b/src/backend_s3.go @@ -780,7 +780,7 @@ func (s3 *S3Backend) SearchAuditLog( ctx context.Context, opts SearchAuditLogOptions, ) iter.Seq2[AuditID, error] { return func(yield func(AuditID, error) bool) { - logc.Printf(ctx, "s3: query audit\n") + logc.Printf(ctx, "s3: search audit\n") ctx, cancel := context.WithCancel(ctx) defer cancel() diff --git a/src/garbage.go b/src/garbage.go new file mode 100644 index 0000000..30c2a64 --- /dev/null +++ b/src/garbage.go @@ -0,0 +1,87 @@ +package git_pages + +import ( + "context" + "fmt" + + "github.com/c2h5oh/datasize" + "github.com/dghubble/trie" +) + +func trieReduce(data trie.Trier) (items, total int64) { + data.Walk(func(key string, value any) error { + items += 1 + total += *value.(*int64) + return nil + }) + return +} + +func TraceGarbage(ctx context.Context) error { + allBlobs := trie.NewRuneTrie() + liveBlobs := trie.NewRuneTrie() + + traceManifest := func(manifestName string, manifest *Manifest) error { + for _, entry := range manifest.GetContents() { + if entry.GetType() == Type_ExternalFile { + blobName := string(entry.Data) + if size := allBlobs.Get(blobName); size == nil { + return fmt.Errorf("%s: dangling reference %s", manifestName, blobName) + } else { + liveBlobs.Put(blobName, size) + } + } + } + return nil + } + + // Enumerate all blobs. + for metadata, err := range backend.EnumerateBlobs(ctx) { + if err != nil { + return fmt.Errorf("trace blobs err: %w", err) + } + allBlobs.Put(metadata.Name, &metadata.Size) + } + + // Enumerate blobs live via site manifests. + for metadata, err := range backend.EnumerateManifests(ctx) { + if err != nil { + return fmt.Errorf("trace sites err: %w", err) + } + manifest, _, err := backend.GetManifest(ctx, metadata.Name, GetManifestOptions{}) + if err != nil { + return fmt.Errorf("trace sites err: %w", err) + } + err = traceManifest(metadata.Name, manifest) + if err != nil { + return fmt.Errorf("trace sites err: %w", err) + } + } + + // Enumerate blobs live via audit records. + for auditID, err := range backend.SearchAuditLog(ctx, SearchAuditLogOptions{}) { + if err != nil { + return fmt.Errorf("trace audit err: %w", err) + } + auditRecord, err := backend.QueryAuditLog(ctx, auditID) + if err != nil { + return fmt.Errorf("trace audit err: %w", err) + } + if auditRecord.Manifest != nil { + err = traceManifest(auditID.String(), auditRecord.Manifest) + if err != nil { + return fmt.Errorf("trace audit err: %w", err) + } + } + } + + allBlobsCount, allBlobsSize := trieReduce(allBlobs) + logc.Printf(ctx, "trace all: %d blobs, %s", + allBlobsCount, datasize.ByteSize(allBlobsSize).HR()) + + liveBlobsCount, liveBlobsSize := trieReduce(liveBlobs) + logc.Printf(ctx, "trace live: %d blobs, %s", + liveBlobsCount, datasize.ByteSize(liveBlobsSize).HR()) + + return nil +} diff --git a/src/main.go b/src/main.go index c0cf511..ca3553a 100644 --- a/src/main.go +++ b/src/main.go @@ -170,16 +170,18 @@ func usage() { fmt.Fprintf(os.Stderr, "Usage:\n") fmt.Fprintf(os.Stderr, "(server) "+ "git-pages [-config |-no-config]\n") + fmt.Fprintf(os.Stderr, "(info) "+ + "git-pages {-print-config-env-vars|-print-config}\n") fmt.Fprintf(os.Stderr, "(debug) "+ "git-pages {-list-blobs|-list-manifests}\n") fmt.Fprintf(os.Stderr, "(debug) "+ "git-pages {-get-blob|-get-manifest|-get-archive|-update-site} [file]\n") fmt.Fprintf(os.Stderr, "(admin) "+ - "git-pages {-run-migration |-freeze-domain |-unfreeze-domain }\n") + "git-pages {-freeze-domain |-unfreeze-domain }\n") fmt.Fprintf(os.Stderr, "(audit) "+ "git-pages {-audit-log|-audit-read |-audit-server [args...]}\n") - fmt.Fprintf(os.Stderr, "(info) "+ - "git-pages {-print-config-env-vars|-print-config}\n") + fmt.Fprintf(os.Stderr, "(maint) "+ + "git-pages {-run-migration |-trace-garbage}\n") flag.PrintDefaults() } @@ -187,24 +189,22 @@ func Main() { ctx := context.Background() flag.Usage = usage - printConfigEnvVars := flag.Bool("print-config-env-vars", false, - "print every recognized configuration environment variable and exit") - printConfig := flag.Bool("print-config", false, - "print configuration as JSON and exit") configTomlPath := flag.String("config", "", "load configuration from `filename` (default: 'config.toml')") noConfig := flag.Bool("no-config", false, "run without configuration file (configure via environment variables)") - runMigration := flag.String("run-migration", "", - "run a store `migration` (one of: create-domain-markers)") - getBlob := flag.String("get-blob", "", - "write contents of `blob` ('sha256-xxxxxxx...xxx')") + printConfigEnvVars := flag.Bool("print-config-env-vars", false, + "print every recognized configuration environment variable and exit") + printConfig := flag.Bool("print-config", false, + "print configuration as JSON and exit") listBlobs := flag.Bool("list-blobs", false, "enumerate every blob with its metadata") - getManifest := flag.String("get-manifest", "", - "write manifest for `site` (either 'domain.tld' or 'domain.tld/dir') as ProtoJSON") listManifests := flag.Bool("list-manifests", false, "enumerate every manifest with its metadata") + getBlob := flag.String("get-blob", "", + "write contents of `blob` ('sha256-xxxxxxx...xxx')") + getManifest := flag.String("get-manifest", "", + "write manifest for `site` (either 'domain.tld' or 'domain.tld/dir') as ProtoJSON") getArchive := flag.String("get-archive", "", "write archive for `site` (either 'domain.tld' or 'domain.tld/dir') in tar format") updateSite := flag.String("update-site", "", @@ -219,15 +219,18 @@ func Main() { "extract contents of audit record `id` to files '-*'") auditServer := flag.String("audit-server", "", "listen for notifications on `endpoint` and spawn a process for each audit event") + runMigration := flag.String("run-migration", "", + "run a store `migration` (one of: create-domain-markers)") + traceGarbage := flag.Bool("trace-garbage", false, + "estimate total size of unreachable blobs") flag.Parse() var cliOperations int for _, selected := range []bool{ - *runMigration != "", - *getBlob != "", *listBlobs, - *getManifest != "", *listManifests, + *getBlob != "", + *getManifest != "", *getArchive != "", *updateSite != "", *freezeDomain != "", @@ -235,14 +238,17 @@ func Main() { *auditLog, *auditRead != "", *auditServer != "", + *runMigration != "", + *traceGarbage, } { if selected { cliOperations++ } } if cliOperations > 1 { - logc.Fatalln(ctx, "-get-blob, -get-manifest, -get-archive, -update-site, "+ - "-freeze, -unfreeze, -audit-log, and -audit-read are mutually exclusive") + logc.Fatalln(ctx, "-list-blobs, -list-manifests, -get-blob, -get-manifest, -get-archive, "+ + "-update-site, -freeze-domain, -unfreeze-domain, -audit-log, -audit-read, "+ + "-audit-server, -run-migration, and -trace-garbage are mutually exclusive") } if *configTomlPath != "" && *noConfig { @@ -288,18 +294,6 @@ func Main() { } switch { - case *runMigration != "": - if err := RunMigration(ctx, *runMigration); err != nil { - logc.Fatalln(ctx, err) - } - - case *getBlob != "": - reader, _, err := backend.GetBlob(ctx, *getBlob) - if err != nil { - logc.Fatalln(ctx, err) - } - io.Copy(fileOutputArg(), reader) - case *listBlobs: for metadata, err := range backend.EnumerateBlobs(ctx) { if err != nil { @@ -312,14 +306,6 @@ func Main() { ) } - case *getManifest != "": - webRoot := webRootArg(*getManifest) - manifest, _, err := backend.GetManifest(ctx, webRoot, GetManifestOptions{}) - if err != nil { - logc.Fatalln(ctx, err) - } - fmt.Fprintln(fileOutputArg(), string(ManifestJSON(manifest))) - case *listManifests: for metadata, err := range backend.EnumerateManifests(ctx) { if err != nil { @@ -332,6 +318,21 @@ func Main() { ) } + case *getBlob != "": + reader, _, err := backend.GetBlob(ctx, *getBlob) + if err != nil { + logc.Fatalln(ctx, err) + } + io.Copy(fileOutputArg(), reader) + + case *getManifest != "": + webRoot := webRootArg(*getManifest) + manifest, _, err := backend.GetManifest(ctx, webRoot, GetManifestOptions{}) + if err != nil { + logc.Fatalln(ctx, err) + } + fmt.Fprintln(fileOutputArg(), string(ManifestJSON(manifest))) + case *getArchive != "": webRoot := webRootArg(*getArchive) manifest, metadata, err := @@ -491,6 +492,16 @@ func Main() { serve(ctx, listen(ctx, "audit", *auditServer), ObserveHTTPHandler(processor)) + case *runMigration != "": + if err = RunMigration(ctx, *runMigration); err != nil { + logc.Fatalln(ctx, err) + } + + case *traceGarbage: + if err = TraceGarbage(ctx); err != nil { + logc.Fatalln(ctx, err) + } + default: // Hook a signal (SIGHUP on *nix, nothing on Windows) for reloading the configuration // at runtime. This is useful because it preserves S3 backend cache contents. Failed