Files
git-pages/src/fetch.go

275 lines
8.1 KiB
Go

package git_pages
import (
"context"
"errors"
"fmt"
"io"
"maps"
"net/url"
"os"
"slices"
"strings"
"github.com/c2h5oh/datasize"
"github.com/go-git/go-billy/v6/osfs"
"github.com/go-git/go-git/v6"
"github.com/go-git/go-git/v6/plumbing"
"github.com/go-git/go-git/v6/plumbing/cache"
"github.com/go-git/go-git/v6/plumbing/filemode"
"github.com/go-git/go-git/v6/plumbing/object"
"github.com/go-git/go-git/v6/plumbing/protocol/packp"
"github.com/go-git/go-git/v6/plumbing/transport"
"github.com/go-git/go-git/v6/storage/filesystem"
"google.golang.org/protobuf/proto"
)
// ErrRepositoryTooLarge is the sentinel wrapped by the error that readGitBlob returns once the
// running total of blob bytes exceeds config.Limits.MaxSiteSize. Match it with errors.Is.
var ErrRepositoryTooLarge = errors.New("repository too large")
// FetchRepository makes a bare, single-branch, depth-1 clone of `branch` from `repoURL` into
// a temporary directory and builds a manifest from the tree of the branch head.
//
// The clone is attempted first with a partial-clone filter and, if that fails, once more
// without any filter. Blob contents are then obtained from up to three sources, in order:
//
//  1. entries of `oldManifest` that carry the same git hash (no transfer, no recompression);
//  2. the clone itself, for blobs that are already present in it;
//  3. a follow-up fetch that requests only the blobs that are still missing.
//
// `oldManifest` is only accessed through its GetContents getter.
// NOTE(review): presumably that getter is nil-safe (generated protobuf code); confirm before
// relying on passing a nil `oldManifest`.
//
// On success the populated manifest is returned. Any failure is returned as a wrapped error;
// if the data read from git exceeds the configured site size limit, the returned error
// matches ErrRepositoryTooLarge via errors.Is.
func FetchRepository(
	ctx context.Context, repoURL string, branch string, oldManifest *Manifest,
) (
	*Manifest, error,
) {
	span, ctx := ObserveFunction(ctx, "FetchRepository",
		"git.repository", repoURL, "git.branch", branch)
	defer span.Finish()

	parsedRepoURL, err := url.Parse(repoURL)
	if err != nil {
		return nil, fmt.Errorf("URL parse: %w", err)
	}

	var repo *git.Repository
	var storer *filesystem.Storage
	for _, filter := range []packp.Filter{packp.FilterBlobNone(), packp.Filter("")} {
		var tempDir string
		if tempDir, err = os.MkdirTemp("", "fetchRepo"); err != nil {
			return nil, fmt.Errorf("mkdtemp: %w", err)
		}
		// This defer is intentionally function-scoped rather than iteration-scoped: the
		// storage of the successful clone is still read from (and fetched into) below,
		// so its directory must survive until FetchRepository returns.
		defer os.RemoveAll(tempDir)

		storer = filesystem.NewStorageWithOptions(
			osfs.New(tempDir, osfs.WithBoundOS()),
			cache.NewObjectLRUDefault(),
			filesystem.Options{
				ExclusiveAccess:      true,
				LargeObjectThreshold: int64(config.Limits.GitLargeObjectThreshold.Bytes()),
			},
		)
		repo, err = git.CloneContext(ctx, storer, nil, &git.CloneOptions{
			Bare:          true,
			URL:           repoURL,
			ReferenceName: plumbing.NewBranchReferenceName(branch),
			SingleBranch:  true,
			Depth:         1,
			Tags:          git.NoTags,
			Filter:        filter,
		})
		if err == nil {
			logc.Printf(ctx, "clone ok: %s %s filter=%q\n", repoURL, branch, filter)
			break
		}
		// Log the error itself: if the next attempt runs, `err` is overwritten and the
		// reason this attempt failed would otherwise never be reported anywhere.
		logc.Printf(ctx, "clone err: %s %s filter=%q: %s\n", repoURL, branch, filter, err)
	}
	if err != nil {
		// Only reached when every attempt failed; `err` is the error of the last one.
		return nil, fmt.Errorf("git clone: %w", err)
	}

	ref, err := repo.Head()
	if err != nil {
		return nil, fmt.Errorf("git head: %w", err)
	}

	commit, err := repo.CommitObject(ref.Hash())
	if err != nil {
		return nil, fmt.Errorf("git commit: %w", err)
	}

	tree, err := repo.TreeObject(commit.TreeHash)
	if err != nil {
		return nil, fmt.Errorf("git tree: %w", err)
	}

	walker := object.NewTreeWalker(tree, true, make(map[plumbing.Hash]bool))
	defer walker.Close()

	// Create a manifest for the tree object corresponding to `branch`, but do not populate it
	// with data yet; instead, record all the blobs we'll need.
	manifest := NewManifest()
	manifest.RepoUrl = proto.String(repoURL)
	manifest.Branch = proto.String(branch)
	manifest.Commit = proto.String(ref.Hash().String())
	blobsNeeded := map[plumbing.Hash]*Entry{}
	for {
		name, entry, err := walker.Next()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("git walker: %w", err)
		}

		manifestEntry := &Entry{}
		if existingManifestEntry, found := blobsNeeded[entry.Hash]; found {
			// If the same blob is present twice, we only need to fetch it once (and both
			// instances will alias the same `Entry` structure in the manifest).
			manifestEntry = existingManifestEntry
		} else if entry.Mode.IsFile() {
			blobsNeeded[entry.Hash] = manifestEntry
			if entry.Mode == filemode.Symlink {
				manifestEntry.Type = Type_Symlink.Enum()
			} else {
				manifestEntry.Type = Type_InlineFile.Enum()
			}
			manifestEntry.GitHash = proto.String(entry.Hash.String())
		} else if entry.Mode == filemode.Dir {
			manifestEntry.Type = Type_Directory.Enum()
		} else {
			// Anything that is neither a file nor a directory is reported as a problem
			// and left out of the manifest contents.
			AddProblem(manifest, name, "unsupported mode %#o", entry.Mode)
			continue
		}
		manifest.Contents[name] = manifestEntry
	}

	// Collect checkout statistics.
	var dataBytesRecycled int64
	var dataBytesTransferred int64

	// First, see if we can extract the blobs from the old manifest. This is the preferred option
	// because it avoids both network transfers and recompression. Note that we do not request
	// blobs from the backend under any circumstances to avoid creating a blob existence oracle.
	for _, oldManifestEntry := range oldManifest.GetContents() {
		if hash, ok := plumbing.FromHex(oldManifestEntry.GetGitHash()); ok {
			if manifestEntry, found := blobsNeeded[hash]; found {
				// Overwrite in place (rather than replacing the pointer) so that every
				// manifest path aliasing this entry observes the recycled contents.
				manifestEntry.Reset()
				proto.Merge(manifestEntry, oldManifestEntry)
				dataBytesRecycled += oldManifestEntry.GetOriginalSize()
				delete(blobsNeeded, hash)
			}
		}
	}

	// Second, fill the manifest entries with data from the git checkout we just made.
	// This will only succeed if a `blob:none` filter isn't supported and we got a full
	// clone despite asking for a partial clone.
	for hash, manifestEntry := range blobsNeeded {
		if err := readGitBlob(repo, hash, manifestEntry, &dataBytesTransferred); err == nil {
			delete(blobsNeeded, hash)
		} else if errors.Is(err, ErrRepositoryTooLarge) {
			return nil, err
		}
		// Any other error is deliberately ignored here: the blob stays in `blobsNeeded`
		// and is requested from the server in the next step.
	}

	// Third, if we still don't have data for some manifest entries, re-establish a git transport
	// and request the missing blobs (only) from the server.
	if len(blobsNeeded) > 0 {
		client, err := transport.Get(parsedRepoURL.Scheme)
		if err != nil {
			return nil, fmt.Errorf("git transport: %w", err)
		}

		endpoint, err := transport.NewEndpoint(repoURL)
		if err != nil {
			return nil, fmt.Errorf("git endpoint: %w", err)
		}

		session, err := client.NewSession(storer, endpoint, nil)
		if err != nil {
			return nil, fmt.Errorf("git session: %w", err)
		}

		connection, err := session.Handshake(ctx, transport.UploadPackService)
		if err != nil {
			return nil, fmt.Errorf("git connection: %w", err)
		}
		defer connection.Close()

		if err := connection.Fetch(ctx, &transport.FetchRequest{
			Wants: slices.Collect(maps.Keys(blobsNeeded)),
			Depth: 1,
			// Git CLI behaves like this, even if the wants above are references to blobs.
			Filter: "blob:none",
		}); err != nil && !errors.Is(err, transport.ErrNoChange) {
			return nil, fmt.Errorf("git blob fetch request: %w", err)
		}

		// All remaining blobs should now be available.
		for hash, manifestEntry := range blobsNeeded {
			if err := readGitBlob(repo, hash, manifestEntry, &dataBytesTransferred); err != nil {
				return nil, err
			}
			delete(blobsNeeded, hash)
		}
	}

	logc.Printf(ctx,
		"reuse: %s recycled, %s transferred\n",
		datasize.ByteSize(dataBytesRecycled).HR(),
		datasize.ByteSize(dataBytesTransferred).HR(),
	)

	warnAboutGitLFS(ctx, manifest)

	return manifest, nil
}
// readGitBlob loads the blob `hash` from `repo` and stores its contents in `entry` as
// untransformed data, setting both the original and the compressed size to the blob size.
//
// `bytesTransferred` is a running total shared across calls. The size of this blob is added
// to it when the blob has been read successfully or when adding it breaches the limit; it
// is left untouched if the blob cannot be looked up, opened, or read, so that a later retry
// of the same blob is not counted twice.
//
// The limit config.Limits.MaxSiteSize is enforced against the size recorded on the blob
// object before any contents are read, so an oversized blob is never loaded into memory.
// In that case `entry` is left unmodified and the returned error matches
// ErrRepositoryTooLarge via errors.Is.
//
// Panics if `entry` is neither an inline file nor a symlink; the visible caller only ever
// passes entries of those two types, so anything else is a programming error.
func readGitBlob(
	repo *git.Repository, hash plumbing.Hash, entry *Entry, bytesTransferred *int64,
) error {
	blob, err := repo.BlobObject(hash)
	if err != nil {
		return fmt.Errorf("git blob %s: %w", hash, err)
	}

	// Check the limit up front, using the size known from the object itself.
	totalBytes := *bytesTransferred + blob.Size
	if uint64(totalBytes) > config.Limits.MaxSiteSize.Bytes() {
		*bytesTransferred = totalBytes
		return fmt.Errorf("%w: fetch exceeds %s limit",
			ErrRepositoryTooLarge,
			config.Limits.MaxSiteSize.HR(),
		)
	}

	reader, err := blob.Reader()
	if err != nil {
		return fmt.Errorf("git blob open: %w", err)
	}
	defer reader.Close()

	data, err := io.ReadAll(reader)
	if err != nil {
		return fmt.Errorf("git blob read: %w", err)
	}

	switch entry.GetType() {
	case Type_InlineFile, Type_Symlink:
		// okay
	default:
		panic(fmt.Errorf("readGitBlob encountered invalid entry: %v, %v",
			entry.GetType(), entry.GetTransform()))
	}

	entry.Data = data
	entry.Transform = Transform_Identity.Enum()
	entry.OriginalSize = proto.Int64(blob.Size)
	entry.CompressedSize = proto.Int64(blob.Size)
	*bytesTransferred = totalBytes
	return nil
}
// warnAboutGitLFS walks the manifest contents in sorted path order and attaches a problem
// to the manifest for every regular file whose matched git attributes contain a `filter`
// attribute with the value `lfs`. Entries that are not regular files are skipped.
func warnAboutGitLFS(ctx context.Context, manifest *Manifest) {
	attributes := ReadGitAttributes(ctx, manifest)

	// Sort the paths so that problems are recorded in a deterministic order.
	filePaths := slices.Collect(maps.Keys(manifest.GetContents()))
	slices.Sort(filePaths)

	for _, filePath := range filePaths {
		if candidate := manifest.GetContents()[filePath]; !IsEntryRegularFile(candidate) {
			continue
		}

		// The attribute matcher takes the path as a list of components.
		matched, _ := attributes.Match(strings.Split(filePath, "/"), nil)
		filterAttr, present := matched["filter"]
		if !present || filterAttr.Value() != "lfs" {
			continue
		}

		AddProblem(manifest, filePath, "git-pages does not support Git LFS; move this file into Git or use incremental uploads")
	}
}