From 50d28f3c8b4181e605da911635a3386029306fd5 Mon Sep 17 00:00:00 2001 From: Catherine Date: Fri, 5 Dec 2025 10:52:54 +0000 Subject: [PATCH] Resolve `/git/blobs/` symlinks as blob references to the old manifest. This will be used for incremental archive uploads. --- src/extract.go | 42 ++++++++++++++++++++++++++++++++++++++---- src/manifest.go | 29 +++++++++++++++++------------ src/update.go | 14 +++++++++----- 3 files changed, 64 insertions(+), 21 deletions(-) diff --git a/src/extract.go b/src/extract.go index e97e188..e8d39cd 100644 --- a/src/extract.go +++ b/src/extract.go @@ -12,11 +12,14 @@ import ( "strings" "github.com/c2h5oh/datasize" + "github.com/go-git/go-git/v6/plumbing" "github.com/klauspost/compress/zstd" ) var ErrArchiveTooLarge = errors.New("archive too large") +const BlobReferencePrefix = "/git/blobs/" + func boundArchiveStream(reader io.Reader) io.Reader { return ReadAtMost(reader, int64(config.Limits.MaxSiteSize.Bytes()), fmt.Errorf("%w: %s limit exceeded", ErrArchiveTooLarge, config.Limits.MaxSiteSize.HR())) @@ -42,9 +45,39 @@ func ExtractZstd(reader io.Reader, next func(io.Reader) (*Manifest, error)) (*Ma return next(boundArchiveStream(stream)) } -func ExtractTar(reader io.Reader) (*Manifest, error) { +// Returns a map of git hash to entry. If `manifest` is nil, returns an empty map. +func indexManifestByGitHash(manifest *Manifest) map[string]*Entry { + index := map[string]*Entry{} + for _, entry := range manifest.GetContents() { + if hash := entry.GetGitHash(); hash != "" { + if _, ok := plumbing.FromHex(hash); ok { + index[hash] = entry + } else { + panic(fmt.Errorf("index: malformed hash: %s", hash)) + } + } + } + return index +} + +func addSymlinkOrBlobReference( + manifest *Manifest, fileName string, target string, index map[string]*Entry, +) { + if hash, found := strings.CutPrefix(target, BlobReferencePrefix); found { + if entry, found := index[hash]; found { + manifest.Contents[fileName] = entry + } else { + AddProblem(manifest, fileName, "unresolved reference: %s", target) + } + } else { + AddSymlink(manifest, fileName, target) + } +} + +func ExtractTar(reader io.Reader, oldManifest *Manifest) (*Manifest, error) { archive := tar.NewReader(reader) + index := indexManifestByGitHash(oldManifest) manifest := NewManifest() for { header, err := archive.Next() @@ -73,7 +106,7 @@ func ExtractTar(reader io.Reader) (*Manifest, error) { } AddFile(manifest, fileName, fileData) case tar.TypeSymlink: - AddSymlink(manifest, fileName, header.Linkname) + addSymlinkOrBlobReference(manifest, fileName, header.Linkname, index) case tar.TypeDir: AddDirectory(manifest, fileName) default: @@ -84,7 +117,7 @@ func ExtractTar(reader io.Reader) (*Manifest, error) { return manifest, nil } -func ExtractZip(reader io.Reader) (*Manifest, error) { +func ExtractZip(reader io.Reader, oldManifest *Manifest) (*Manifest, error) { data, err := io.ReadAll(reader) if err != nil { return nil, err @@ -108,6 +141,7 @@ func ExtractZip(reader io.Reader) (*Manifest, error) { ) } + index := indexManifestByGitHash(oldManifest) manifest := NewManifest() for _, file := range archive.File { if strings.HasSuffix(file.Name, "/") { @@ -125,7 +159,7 @@ func ExtractZip(reader io.Reader) (*Manifest, error) { } if file.Mode()&os.ModeSymlink != 0 { - AddSymlink(manifest, file.Name, string(fileData)) + addSymlinkOrBlobReference(manifest, file.Name, string(fileData), index) } else { AddFile(manifest, file.Name, fileData) } diff --git a/src/manifest.go b/src/manifest.go index d9da487..4d2bd14 100644 --- a/src/manifest.go +++ b/src/manifest.go @@ -104,7 +104,7 @@ func NewManifestEntry(type_ Type, data []byte) *Entry { return entry } -func AddFile(manifest *Manifest, path string, data []byte) *Entry { +func AddFile(manifest *Manifest, fileName string, data []byte) *Entry { // Fill in `git_hash` even for files not originating from git using the SHA256 algorithm; // we use this primarily for incremental archive uploads, but when support for git SHA256 // repositories is complete, archive uploads and git checkouts will have cross-support for @@ -113,30 +113,35 @@ func AddFile(manifest *Manifest, path string, data []byte) *Entry { hasher.Write(data) entry := NewManifestEntry(Type_InlineFile, data) entry.GitHash = proto.String(hasher.Sum().String()) - manifest.Contents[path] = entry + manifest.Contents[fileName] = entry return entry } -func AddSymlink(manifest *Manifest, path string, target string) *Entry { - entry := NewManifestEntry(Type_Symlink, []byte(target)) - manifest.Contents[path] = entry - return entry +func AddSymlink(manifest *Manifest, fileName string, target string) *Entry { + if path.IsAbs(target) { + AddProblem(manifest, fileName, "absolute symlink: %s", target) + return nil + } else { + entry := NewManifestEntry(Type_Symlink, []byte(target)) + manifest.Contents[fileName] = entry + return entry + } } -func AddDirectory(manifest *Manifest, path string) *Entry { - path = strings.TrimSuffix(path, "/") +func AddDirectory(manifest *Manifest, dirName string) *Entry { + dirName = strings.TrimSuffix(dirName, "/") entry := NewManifestEntry(Type_Directory, nil) - manifest.Contents[path] = entry + manifest.Contents[dirName] = entry return entry } -func AddProblem(manifest *Manifest, path, format string, args ...any) error { +func AddProblem(manifest *Manifest, pathName, format string, args ...any) error { cause := fmt.Sprintf(format, args...) manifest.Problems = append(manifest.Problems, &Problem{ - Path: proto.String(path), + Path: proto.String(pathName), Cause: proto.String(cause), }) - return fmt.Errorf("%s: %s", path, cause) + return fmt.Errorf("%s: %s", pathName, cause) } func GetProblemReport(manifest *Manifest) []string { diff --git a/src/update.go b/src/update.go index e0a6af5..8d90a2b 100644 --- a/src/update.go +++ b/src/update.go @@ -122,23 +122,27 @@ func UpdateFromArchive( ) (result UpdateResult) { var err error - // Ignore errors; here the old manifest is used only to determine the update outcome. + // Ignore errors; worst case we have to re-fetch all of the blobs. oldManifest, _, _ := backend.GetManifest(ctx, webRoot, GetManifestOptions{}) + extractTar := func(reader io.Reader) (*Manifest, error) { + return ExtractTar(reader, oldManifest) + } + var newManifest *Manifest switch contentType { case "application/x-tar": logc.Printf(ctx, "update %s: (tar)", webRoot) - newManifest, err = ExtractTar(reader) // yellow? + newManifest, err = extractTar(reader) // yellow? case "application/x-tar+gzip": logc.Printf(ctx, "update %s: (tar.gz)", webRoot) - newManifest, err = ExtractGzip(reader, ExtractTar) // definitely yellow. + newManifest, err = ExtractGzip(reader, extractTar) // definitely yellow. case "application/x-tar+zstd": logc.Printf(ctx, "update %s: (tar.zst)", webRoot) - newManifest, err = ExtractZstd(reader, ExtractTar) + newManifest, err = ExtractZstd(reader, extractTar) case "application/zip": logc.Printf(ctx, "update %s: (zip)", webRoot) - newManifest, err = ExtractZip(reader) + newManifest, err = ExtractZip(reader, oldManifest) default: err = errArchiveFormat }