From 73e47cd8d5f46486fa9667cd9f089ee67c6a7483 Mon Sep 17 00:00:00 2001 From: Catherine Date: Tue, 5 May 2026 01:57:40 +0000 Subject: [PATCH] Significantly improve efficiency of tracing. I thought I was being smart by using a trie to record blob existence and sizes. I was not. The trie approach had at least ~5 times less throughput and consumed entirely unreasonable amounts of RAM. A hashmap works just fine here. --- go.mod | 1 - go.sum | 2 -- gomod2nix.toml | 3 --- src/garbage.go | 47 ++++++++++++++++++++++++----------------------- 4 files changed, 24 insertions(+), 29 deletions(-) diff --git a/go.mod b/go.mod index b9a4707..262437e 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,6 @@ require ( github.com/bits-and-blooms/bloom/v3 v3.7.1 github.com/c2h5oh/datasize v0.0.0-20231215233829-aa82cc1e6500 github.com/creasty/defaults v1.8.0 - github.com/dghubble/trie v0.1.0 github.com/fatih/color v1.19.0 github.com/go-git/go-billy/v6 v6.0.0-20260410103409-85b6241850b5 github.com/go-git/go-git/v6 v6.0.0-alpha.2 diff --git a/go.sum b/go.sum index e9e3ec3..759b0ae 100644 --- a/go.sum +++ b/go.sum @@ -33,8 +33,6 @@ github.com/cyphar/filepath-securejoin v0.6.1/go.mod h1:A8hd4EnAeyujCJRrICiOWqjS1 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/dghubble/trie v0.1.0 h1:kJnjBLFFElBwS60N4tkPvnLhnpcDxbBjIulgI8CpNGM= -github.com/dghubble/trie v0.1.0/go.mod h1:sOmnzfBNH7H92ow2292dDFWNsVQuh/izuD7otCYb1ak= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= diff --git a/gomod2nix.toml b/gomod2nix.toml index 861be76..c354fe6 100644 --- a/gomod2nix.toml +++ b/gomod2nix.toml @@ -46,9 +46,6 @@ schema = 3 [mod."github.com/davecgh/go-spew"] version = "v1.1.1" hash = "sha256-nhzSUrE1fCkN0+RL04N4h8jWmRFPPPWbCuDc7Ss0akI=" - [mod."github.com/dghubble/trie"] - version = "v0.1.0" - hash = "sha256-hVh7uYylpMCCSPcxl70hJTmzSwaA1MxBmJFBO5Xdncc=" [mod."github.com/dustin/go-humanize"] version = "v1.0.1" hash = "sha256-yuvxYYngpfVkUg9yAmG99IUVmADTQA0tMbBXe0Fq0Mc=" diff --git a/src/garbage.go b/src/garbage.go index 4dd0ca5..47335fd 100644 --- a/src/garbage.go +++ b/src/garbage.go @@ -5,30 +5,29 @@ import ( "fmt" "github.com/c2h5oh/datasize" - "github.com/dghubble/trie" ) -func trieReduce(data trie.Trier) (items, total int64) { - data.Walk(func(key string, value any) error { - items += 1 - total += *value.(*int64) - return nil - }) - return -} - func TraceGarbage(ctx context.Context) error { - allBlobs := trie.NewRuneTrie() - liveBlobs := trie.NewRuneTrie() + allBlobs := map[string]int64{} + liveBlobs := map[string]int64{} - traceManifest := func(manifestName string, manifest *Manifest) error { + reduceBlobs := func(data map[string]int64) (items, total int64) { + for _, value := range data { + items += 1 + total += value + } + return + } + + traceManifest := func(manifestKind string, manifestName string, manifest *Manifest) error { for _, entry := range manifest.GetContents() { if entry.GetType() == Type_ExternalFile { blobName := string(entry.Data) - if size := allBlobs.Get(blobName); size == nil { - return fmt.Errorf("%s: dangling reference %s", manifestName, blobName) + if size, ok := allBlobs[blobName]; ok { + liveBlobs[blobName] = size } else { - liveBlobs.Put(blobName, size) + logc.Printf(ctx, "trace manifest: %s/%s: dangling reference %s", + manifestKind, manifestName, blobName) } } } @@ -36,42 +35,44 @@ func TraceGarbage(ctx context.Context) error { } // Enumerate all blobs. + logc.Printf(ctx, "trace: enumerating blobs") for metadata, err := range backend.EnumerateBlobs(ctx) { if err != nil { return fmt.Errorf("trace blobs err: %w", err) } - allBlobs.Put(metadata.Name, &metadata.Size) + allBlobs[metadata.Name] = metadata.Size } // Enumerate blobs live via site manifests. + logc.Printf(ctx, "trace: enumerating manifests") for item, err := range backend.GetAllManifests(ctx) { metadata, manifest := item.Splat() if err != nil { return fmt.Errorf("trace sites err: %w", err) } - err = traceManifest(metadata.Name, manifest) + err = traceManifest("site", metadata.Name, manifest) if err != nil { return fmt.Errorf("trace sites err: %w", err) } } // Enumerate blobs live via audit records. - + logc.Printf(ctx, "trace: enumerating audit records") auditIDs := backend.SearchAuditLog(ctx, SearchAuditLogOptions{}) for record, err := range backend.GetAuditLogRecords(ctx, auditIDs) { if err != nil { - logc.Fatalln(ctx, err) + return fmt.Errorf("trace audit err: %w", err) } if record.Manifest != nil { - err = traceManifest(record.GetAuditID().String(), record.Manifest) + err = traceManifest("audit", record.GetAuditID().String(), record.Manifest) if err != nil { return fmt.Errorf("trace audit err: %w", err) } } } - allBlobsCount, allBlobsSize := trieReduce(allBlobs) - liveBlobsCount, liveBlobsSize := trieReduce(liveBlobs) + allBlobsCount, allBlobsSize := reduceBlobs(allBlobs) + liveBlobsCount, liveBlobsSize := reduceBlobs(liveBlobs) logc.Printf(ctx, "trace all: %d blobs, %s", allBlobsCount, datasize.ByteSize(allBlobsSize).HR()) logc.Printf(ctx, "trace live: %d blobs, %s",