Collect statistics on blob reuse during archive upload.

This commit is contained in:
Catherine
2025-12-05 11:20:28 +00:00
parent 50d28f3c8b
commit faa486c779
3 changed files with 61 additions and 29 deletions

View File

@@ -5,6 +5,7 @@ import (
"archive/zip"
"bytes"
"compress/gzip"
"context"
"errors"
"fmt"
"io"
@@ -25,24 +26,30 @@ func boundArchiveStream(reader io.Reader) io.Reader {
fmt.Errorf("%w: %s limit exceeded", ErrArchiveTooLarge, config.Limits.MaxSiteSize.HR()))
}
func ExtractGzip(reader io.Reader, next func(io.Reader) (*Manifest, error)) (*Manifest, error) {
func ExtractGzip(
ctx context.Context, reader io.Reader,
next func(context.Context, io.Reader) (*Manifest, error),
) (*Manifest, error) {
stream, err := gzip.NewReader(reader)
if err != nil {
return nil, err
}
defer stream.Close()
return next(boundArchiveStream(stream))
return next(ctx, boundArchiveStream(stream))
}
func ExtractZstd(reader io.Reader, next func(io.Reader) (*Manifest, error)) (*Manifest, error) {
func ExtractZstd(
ctx context.Context, reader io.Reader,
next func(context.Context, io.Reader) (*Manifest, error),
) (*Manifest, error) {
stream, err := zstd.NewReader(reader)
if err != nil {
return nil, err
}
defer stream.Close()
return next(boundArchiveStream(stream))
return next(ctx, boundArchiveStream(stream))
}
// Returns a map of git hash to entry. If `manifest` is nil, returns an empty map.
@@ -62,21 +69,26 @@ func indexManifestByGitHash(manifest *Manifest) map[string]*Entry {
func addSymlinkOrBlobReference(
manifest *Manifest, fileName string, target string, index map[string]*Entry,
) {
) *Entry {
if hash, found := strings.CutPrefix(target, BlobReferencePrefix); found {
if entry, found := index[hash]; found {
manifest.Contents[fileName] = entry
return entry
} else {
AddProblem(manifest, fileName, "unresolved reference: %s", target)
return nil
}
} else {
AddSymlink(manifest, fileName, target)
return AddSymlink(manifest, fileName, target)
}
}
func ExtractTar(reader io.Reader, oldManifest *Manifest) (*Manifest, error) {
func ExtractTar(ctx context.Context, reader io.Reader, oldManifest *Manifest) (*Manifest, error) {
archive := tar.NewReader(reader)
var dataBytesRecycled int64
var dataBytesTransferred int64
index := indexManifestByGitHash(oldManifest)
manifest := NewManifest()
for {
@@ -105,8 +117,10 @@ func ExtractTar(reader io.Reader, oldManifest *Manifest) (*Manifest, error) {
return nil, fmt.Errorf("tar: %s: %w", fileName, err)
}
AddFile(manifest, fileName, fileData)
dataBytesTransferred += int64(len(fileData))
case tar.TypeSymlink:
addSymlinkOrBlobReference(manifest, fileName, header.Linkname, index)
entry := addSymlinkOrBlobReference(manifest, fileName, header.Linkname, index)
dataBytesRecycled += entry.GetOriginalSize()
case tar.TypeDir:
AddDirectory(manifest, fileName)
default:
@@ -114,10 +128,17 @@ func ExtractTar(reader io.Reader, oldManifest *Manifest) (*Manifest, error) {
continue
}
}
logc.Printf(ctx,
"reuse: %s recycled, %s transferred\n",
datasize.ByteSize(dataBytesRecycled).HR(),
datasize.ByteSize(dataBytesTransferred).HR(),
)
return manifest, nil
}
func ExtractZip(reader io.Reader, oldManifest *Manifest) (*Manifest, error) {
func ExtractZip(ctx context.Context, reader io.Reader, oldManifest *Manifest) (*Manifest, error) {
data, err := io.ReadAll(reader)
if err != nil {
return nil, err
@@ -141,6 +162,9 @@ func ExtractZip(reader io.Reader, oldManifest *Manifest) (*Manifest, error) {
)
}
var dataBytesRecycled int64
var dataBytesTransferred int64
index := indexManifestByGitHash(oldManifest)
manifest := NewManifest()
for _, file := range archive.File {
@@ -159,11 +183,20 @@ func ExtractZip(reader io.Reader, oldManifest *Manifest) (*Manifest, error) {
}
if file.Mode()&os.ModeSymlink != 0 {
addSymlinkOrBlobReference(manifest, file.Name, string(fileData), index)
entry := addSymlinkOrBlobReference(manifest, file.Name, string(fileData), index)
dataBytesRecycled += entry.GetOriginalSize()
} else {
AddFile(manifest, file.Name, fileData)
dataBytesTransferred += int64(len(fileData))
}
}
}
logc.Printf(ctx,
"reuse: %s recycled, %s transferred\n",
datasize.ByteSize(dataBytesRecycled).HR(),
datasize.ByteSize(dataBytesTransferred).HR(),
)
return manifest, nil
}

View File

@@ -131,9 +131,8 @@ func FetchRepository(
}
// Collect checkout statistics.
var dataBytesFromOldManifest int64
var dataBytesFromGitCheckout int64
var dataBytesFromGitTransport int64
var dataBytesRecycled int64
var dataBytesTransferred int64
// First, see if we can extract the blobs from the old manifest. This is the preferred option
// because it avoids both network transfers and recompression. Note that we do not request
@@ -143,7 +142,7 @@ func FetchRepository(
if manifestEntry, found := blobsNeeded[hash]; found {
manifestEntry.Reset()
proto.Merge(manifestEntry, oldManifestEntry)
dataBytesFromOldManifest += oldManifestEntry.GetOriginalSize()
dataBytesRecycled += oldManifestEntry.GetOriginalSize()
delete(blobsNeeded, hash)
}
}
@@ -154,7 +153,7 @@ func FetchRepository(
// clone despite asking for a partial clone.
for hash, manifestEntry := range blobsNeeded {
if err := readGitBlob(repo, hash, manifestEntry); err == nil {
dataBytesFromGitCheckout += manifestEntry.GetOriginalSize()
dataBytesTransferred += manifestEntry.GetOriginalSize()
delete(blobsNeeded, hash)
}
}
@@ -197,15 +196,15 @@ func FetchRepository(
if err := readGitBlob(repo, hash, manifestEntry); err != nil {
return nil, err
}
dataBytesFromGitTransport += manifestEntry.GetOriginalSize()
dataBytesTransferred += manifestEntry.GetOriginalSize()
delete(blobsNeeded, hash)
}
}
logc.Printf(ctx,
"fetch: %s reused, %s received\n",
datasize.ByteSize(dataBytesFromOldManifest).HR(),
datasize.ByteSize(dataBytesFromGitCheckout+dataBytesFromGitTransport).HR(),
"reuse: %s recycled, %s transferred\n",
datasize.ByteSize(dataBytesRecycled).HR(),
datasize.ByteSize(dataBytesTransferred).HR(),
)
return manifest, nil

View File

@@ -125,24 +125,24 @@ func UpdateFromArchive(
// Ignore errors; worst case we have to re-fetch all of the blobs.
oldManifest, _, _ := backend.GetManifest(ctx, webRoot, GetManifestOptions{})
extractTar := func(reader io.Reader) (*Manifest, error) {
return ExtractTar(reader, oldManifest)
extractTar := func(ctx context.Context, reader io.Reader) (*Manifest, error) {
return ExtractTar(ctx, reader, oldManifest)
}
var newManifest *Manifest
switch contentType {
case "application/x-tar":
logc.Printf(ctx, "update %s: (tar)", webRoot)
newManifest, err = extractTar(reader) // yellow?
newManifest, err = extractTar(ctx, reader) // yellow?
case "application/x-tar+gzip":
logc.Printf(ctx, "update %s: (tar.gz)", webRoot)
newManifest, err = ExtractGzip(reader, extractTar) // definitely yellow.
newManifest, err = ExtractGzip(ctx, reader, extractTar) // definitely yellow.
case "application/x-tar+zstd":
logc.Printf(ctx, "update %s: (tar.zst)", webRoot)
newManifest, err = ExtractZstd(reader, extractTar)
newManifest, err = ExtractZstd(ctx, reader, extractTar)
case "application/zip":
logc.Printf(ctx, "update %s: (zip)", webRoot)
newManifest, err = ExtractZip(reader, oldManifest)
newManifest, err = ExtractZip(ctx, reader, oldManifest)
default:
err = errArchiveFormat
}
@@ -177,7 +177,7 @@ func PartialUpdateFromArchive(
return UpdateResult{UpdateError, nil, err}
}
applyTarPatch := func(reader io.Reader) (*Manifest, error) {
applyTarPatch := func(ctx context.Context, reader io.Reader) (*Manifest, error) {
// Clone the manifest before starting to mutate it. `GetManifest` may return cached
// `*Manifest` objects, which should never be mutated.
newManifest := &Manifest{}
@@ -193,13 +193,13 @@ func PartialUpdateFromArchive(
switch contentType {
case "application/x-tar":
logc.Printf(ctx, "patch %s: (tar)", webRoot)
newManifest, err = applyTarPatch(reader)
newManifest, err = applyTarPatch(ctx, reader)
case "application/x-tar+gzip":
logc.Printf(ctx, "patch %s: (tar.gz)", webRoot)
newManifest, err = ExtractGzip(reader, applyTarPatch)
newManifest, err = ExtractGzip(ctx, reader, applyTarPatch)
case "application/x-tar+zstd":
logc.Printf(ctx, "patch %s: (tar.zst)", webRoot)
newManifest, err = ExtractZstd(reader, applyTarPatch)
newManifest, err = ExtractZstd(ctx, reader, applyTarPatch)
default:
err = errArchiveFormat
}