[breaking-change] Replace -size-histogram with -analyze-storage.

The old function did not even draw a histogram (it was a bar chart),
and would essentially always overcount sizes.

The new function is always accurate and just as useful at a glance.
It provides two modes, `text` (optionally colorized) and `json`.
This commit is contained in:
Catherine
2026-05-29 07:10:01 +00:00
parent 7e72765dd1
commit 43088db596
3 changed files with 172 additions and 74 deletions
+106
View File
@@ -0,0 +1,106 @@
package git_pages
import (
"context"
"fmt"
"strings"
)
// StorageSize is the aggregated storage footprint of a single domain, as produced
// by AnalyzeStorage. All sizes are in bytes.
type StorageSize struct {
	// Domain is the domain these figures belong to.
	Domain string `json:"domain"`

	// CurrentSize is the live storage held by the current versions of the
	// domain's sites: manifests plus the blobs they reference.
	CurrentSize int64 `json:"currentSize"`

	// CurrentBlobSize is the blob-only portion of CurrentSize (manifests are
	// excluded). It is omitted from the JSON encoding.
	CurrentBlobSize int64 `json:"-"`

	// NonCurrentSize is the live storage held by non-current versions of the
	// domain's sites that are retained in audit records.
	NonCurrentSize int64 `json:"nonCurrentSize"`

	// TotalSize is the total live storage attributed to the domain.
	TotalSize int64 `json:"totalSize"`
}
// AnalyzeStorage computes the storage attributed to every domain.
//
// It makes two passes over the backend:
//
//  1. Every manifest yielded by backend.GetAllManifests contributes its metadata
//     size and the compressed size of each external-file blob it references to
//     the "current" figures of its domain (the part of the manifest name before
//     the first "/").
//  2. Every audit record yielded by backend.GetAuditLogRecords contributes its
//     encoded size to the "non-current" figures of its domain. If the record
//     carries a manifest and is not detached, the external-file blobs of that
//     manifest are added as well, unless the same blob is already counted as
//     current.
//
// Blobs are keyed by name, so a blob referenced several times within one scope
// is counted once. In addition to the per-domain entries, the result contains an
// entry whose Domain is "*" with the figures deduplicated across all domains.
//
// The order of the returned slice is unspecified (it follows map iteration);
// callers that need a stable order must sort it. The first error yielded by
// either backend iterator aborts the analysis and is returned wrapped.
func AnalyzeStorage(ctx context.Context) ([]*StorageSize, error) {
	// Raw tallies for one scope (a domain, or "*" for everything). Blob maps are
	// keyed by blob name and hold the blob's compressed size.
	type storageData struct {
		siteManifests int64
		siteBlobs     map[string]int64
		auditRecords  int64
		auditBlobs    map[string]int64
	}

	thickStats := map[string]*storageData{}
	getStats := func(domain string) *storageData {
		stats, found := thickStats[domain]
		if !found {
			stats = &storageData{
				siteBlobs:  map[string]int64{},
				auditBlobs: map[string]int64{},
			}
			thickStats[domain] = stats
		}
		return stats
	}
	totalStats := getStats("*")

	logc.Printf(ctx, "analyze: enumerating manifests")
	for item, err := range backend.GetAllManifests(ctx) {
		// Check the error before touching the yielded item.
		if err != nil {
			return nil, fmt.Errorf("analyze err: %w", err)
		}
		metadata, manifest := item.Splat()
		domain, _, _ := strings.Cut(metadata.Name, "/")
		stats := getStats(domain)
		stats.siteManifests += metadata.Size
		totalStats.siteManifests += metadata.Size
		for _, entry := range manifest.GetContents() {
			if entry.GetType() != Type_ExternalFile {
				continue
			}
			blobName, blobSize := string(entry.Data), entry.GetCompressedSize()
			stats.siteBlobs[blobName] = blobSize
			totalStats.siteBlobs[blobName] = blobSize
		}
	}

	logc.Printf(ctx, "analyze: enumerating audit records")
	auditIDs := backend.SearchAuditLog(ctx, SearchAuditLogOptions{})
	for record, err := range backend.GetAuditLogRecords(ctx, auditIDs) {
		if err != nil {
			return nil, fmt.Errorf("analyze err: %w", err)
		}
		stats := getStats(record.GetDomain())
		recordSize := int64(len(EncodeAuditRecord(record)))
		stats.auditRecords += recordSize
		totalStats.auditRecords += recordSize
		if record.Manifest == nil || record.IsDetached() {
			continue
		}
		for _, entry := range record.Manifest.GetContents() {
			if entry.GetType() != Type_ExternalFile {
				continue
			}
			blobName, blobSize := string(entry.Data), entry.GetCompressedSize()
			// A blob that is already counted as current must not be counted again
			// as non-current. The domain and the grand total are checked
			// independently: a blob may be current for another domain's site
			// (and therefore current in the total) while being non-current here.
			if _, found := stats.siteBlobs[blobName]; !found {
				stats.auditBlobs[blobName] = blobSize
			}
			if _, found := totalStats.siteBlobs[blobName]; !found {
				totalStats.auditBlobs[blobName] = blobSize
			}
		}
	}

	// Now aggregate the information. The "*" scope lives in the same map and is
	// emitted like any other entry.
	thinStats := make([]*StorageSize, 0, len(thickStats))
	for domain, stats := range thickStats {
		sizes := StorageSize{Domain: domain}
		sizes.CurrentSize += stats.siteManifests
		for _, size := range stats.siteBlobs {
			sizes.CurrentSize += size
			sizes.CurrentBlobSize += size
		}
		sizes.NonCurrentSize += stats.auditRecords
		for _, size := range stats.auditBlobs {
			sizes.NonCurrentSize += size
		}
		sizes.TotalSize = sizes.CurrentSize + sizes.NonCurrentSize
		thinStats = append(thinStats, &sizes)
	}
	return thinStats, nil
}
-35
View File
@@ -1,35 +0,0 @@
package git_pages
import (
"context"
"fmt"
"maps"
"slices"
"strings"
)
// DomainStatistics holds the per-domain size sums computed by SizeHistogram.
// Each size field is accumulated over every manifest of the domain and includes
// the manifest's own metadata size.
type DomainStatistics struct {
	// Domain is the domain the sums belong to.
	Domain string
	// OriginalSize sums the manifests' original sizes.
	OriginalSize int64
	// CompressedSize sums the manifests' compressed sizes.
	CompressedSize int64
	// StoredSize sums the manifests' stored sizes.
	StoredSize int64
}
// SizeHistogram sums, for every domain, the original, compressed, and stored
// sizes reported by each of its manifests, adding the manifest's metadata size
// to each of the three sums. The domain is the part of the manifest name before
// the first "/".
//
// The order of the returned slice is unspecified. The first error yielded by
// backend.GetAllManifests aborts the walk and is returned wrapped.
func SizeHistogram(ctx context.Context) ([]*DomainStatistics, error) {
	perDomain := map[string]*DomainStatistics{}
	for item, err := range backend.GetAllManifests(ctx) {
		metadata, manifest := item.Splat()
		if err != nil {
			return nil, fmt.Errorf("size histogram err: %w", err)
		}

		domain, _, _ := strings.Cut(metadata.Name, "/")
		entry, known := perDomain[domain]
		if !known {
			entry = &DomainStatistics{Domain: domain}
			perDomain[domain] = entry
		}

		// The manifest itself occupies storage too, so its size is part of
		// every figure.
		entry.OriginalSize += metadata.Size + manifest.GetOriginalSize()
		entry.CompressedSize += metadata.Size + manifest.GetCompressedSize()
		entry.StoredSize += metadata.Size + manifest.GetStoredSize()
	}

	collected := slices.Collect(maps.Values(perDomain))
	return collected, nil
}
+66 -39
View File
@@ -4,6 +4,7 @@ import (
"cmp"
"context"
"crypto/tls"
"encoding/json"
"errors"
"flag"
"fmt"
@@ -219,7 +220,7 @@ func usage() {
fmt.Fprintf(os.Stderr, "(maint) "+
"git-pages -site-expire [-dry-run]\n")
fmt.Fprintf(os.Stderr, "(maint) "+
"git-pages {-run-migration <name>|-trace-garbage|-size-histogram {original|stored}}\n")
"git-pages {-run-migration <name>|-trace-garbage|-analyze-storage}\n")
flag.PrintDefaults()
}
@@ -269,8 +270,8 @@ func Main(versionInfo string) {
"expire sites according to their manifest")
runMigration := flag.String("run-migration", "",
"run a store `migration` (one of: create-domain-markers)")
sizeHistogram := flag.String("size-histogram", "",
"display histogram of `size-type` (original or stored) per domain")
analyzeStorage := flag.String("analyze-storage", "",
"display aggregate storage used per domain")
traceGarbage := flag.Bool("trace-garbage", false,
"estimate total size of unreachable blobs")
dryRun := flag.Bool("dry-run", false,
@@ -302,7 +303,7 @@ func Main(versionInfo string) {
*auditServer != "",
*siteExpire,
*runMigration != "",
*sizeHistogram != "",
*analyzeStorage != "",
*traceGarbage,
} {
if selected {
@@ -313,7 +314,7 @@ func Main(versionInfo string) {
logc.Fatalln(ctx, "-list-blobs, -list-manifests, -get-blob, -get-manifest, -get-archive, "+
"-update-site, -freeze-domain, -unfreeze-domain, -audit-log, -audit-read, "+
"-audit-rollback, -audit-expire, -audit-detach, -audit-server, -site-expire, "+
"-run-migration, -size-histogram, and -trace-garbage are mutually exclusive")
"-run-migration, -analyze-storage, and -trace-garbage are mutually exclusive")
}
if *dryRun && !(*siteExpire) {
logc.Fatalln(ctx, "-dry-run is not applicable in this context")
@@ -536,10 +537,10 @@ func Main(versionInfo string) {
for _, record := range records {
parts := []string{
record.GetAuditID().String(),
color.HiWhiteString(record.GetTimestamp().AsTime().UTC().Format(time.RFC3339)),
color.HiWhiteString("%s", record.GetTimestamp().AsTime().UTC().Format(time.RFC3339)),
fmt.Sprint(record.GetEvent()),
color.HiGreenString(record.DescribeResource()),
color.HiMagentaString(record.DescribePrincipal()),
color.HiMagentaString("%s", record.DescribeResource()),
color.HiGreenString("%s", record.DescribePrincipal()),
}
if record.IsDetached() {
parts = append(parts,
@@ -713,47 +714,73 @@ func Main(versionInfo string) {
logc.Fatalln(ctx, err)
}
case *sizeHistogram != "":
extractSize := func(s *DomainStatistics) int64 { return 0 }
switch *sizeHistogram {
case "original":
// Displays a size histogram using the `manifest.OriginalSize`, which is useful to see
// which site is the closest to hitting the size limit (checked against apparent size).
// This apparent size does not have any direct relationship with used storage.
extractSize = func(s *DomainStatistics) int64 { return s.OriginalSize }
case "stored":
// Displays a size histogram using the `manifest.StoredSize`, which is useful to see
// which site consumes the most resources. The site is keeping at least this many
// bytes worth of blobs alive, but removing it may not free any space because
// deduplication is global.
extractSize = func(s *DomainStatistics) int64 { return s.StoredSize }
default:
logc.Fatalln(ctx, "unknown histogram type")
case *analyzeStorage == "text":
// datasize.ByteSize.HR() is a little too wide for the 8-char column.
formatSize := func(b datasize.ByteSize) string {
switch {
case b > datasize.GB:
return fmt.Sprintf("%.1fG", b.GBytes())
case b > datasize.MB:
return fmt.Sprintf("%.1fM", b.MBytes())
case b > datasize.KB:
return fmt.Sprintf("%.1fK", b.KBytes())
default:
return fmt.Sprintf("%dB", b)
}
}
histogram, err := SizeHistogram(ctx)
analysis, err := AnalyzeStorage(ctx)
if err != nil {
logc.Fatalln(ctx, err)
}
slices.SortFunc(histogram, func(a *DomainStatistics, b *DomainStatistics) int {
return cmp.Compare(extractSize(a), extractSize(b))
slices.SortFunc(analysis, func(a *StorageSize, b *StorageSize) int {
return cmp.Compare(a.TotalSize, b.TotalSize)
})
if len(histogram) > 0 {
fullScaleSize := max(extractSize(histogram[len(histogram)-1]), 1)
fullScaleWidth := int64(40)
for _, statistics := range histogram {
size := extractSize(statistics)
barWidth := size * fullScaleWidth / fullScaleSize
spaceWidth := fullScaleWidth - barWidth
bar := strings.Repeat("*", int(barWidth)) + strings.Repeat(" ", int(spaceWidth))
fmt.Fprintf(color.Output, "%s %s %s\n",
color.HiBlackString(fmt.Sprint("|", bar, "|")),
statistics.Domain,
color.HiGreenString(datasize.ByteSize(extractSize(statistics)).HR()),
for _, sizes := range analysis {
var colorize func(string, ...interface{}) string
fractionSize :=
float32(sizes.CurrentBlobSize) / float32(config.Limits.MaxSiteSize.Bytes())
switch {
case fractionSize > 0.9:
colorize = color.HiRedString
case fractionSize > 0.7:
colorize = color.HiYellowString
case fractionSize > 0.1:
colorize = color.HiGreenString
default:
colorize = color.HiWhiteString
}
if sizes.Domain != "*" {
fmt.Fprintf(color.Output, "%s\t%s\t%s\t%s\t%s\n",
colorize("%.0f%%", fractionSize*100.0),
colorize("%s", formatSize(datasize.ByteSize(sizes.CurrentSize))),
formatSize(datasize.ByteSize(sizes.NonCurrentSize)),
formatSize(datasize.ByteSize(sizes.TotalSize)),
color.HiMagentaString("%s", sizes.Domain),
)
} else {
fmt.Fprintf(color.Output, "---\t%s\t%s\t%s\t%s\n",
color.HiCyanString("%s", formatSize(datasize.ByteSize(sizes.CurrentSize))),
formatSize(datasize.ByteSize(sizes.NonCurrentSize)),
formatSize(datasize.ByteSize(sizes.TotalSize)),
color.HiMagentaString("%s", sizes.Domain),
)
}
}
fmt.Fprintf(color.Output, "Quota%%\tCurrent\tNonCurr\tTotal\tDomain\n")
case *analyzeStorage == "json":
analysis, err := AnalyzeStorage(ctx)
if err != nil {
logc.Fatalln(ctx, err)
}
encoder := json.NewEncoder(os.Stdout)
encoder.Encode(analysis)
case *analyzeStorage != "":
logc.Fatalf(ctx, "unsupported -analyze-storage mode")
case *traceGarbage:
if err = TraceGarbage(ctx); err != nil {