Allow downloading an entire site via CLI or HTTP.

The HTTP endpoint is `/.git-pages/archive.tar` and it is gated behind
a feature flag `archive-site`. It serially downloads every blob and
writes it to the client in a chunked response, optionally compressed
with gzip or zstd as per `Accept-Encoding:`. It is authorized the same
as `/.git-pages/manifest.json`, for the same reasons.

The CLI operation is `-get-archive <site-name>` and it writes a tar
archive to stdout. This could be useful for an administrator to review
the contents of a site in response to a report.

Both `_headers` and `_redirects` files are present in the output,
reconstituted from the manifest.
This commit is contained in:
Catherine
2025-11-20 00:37:16 +00:00
parent aa6e495505
commit 6db850e2c4
9 changed files with 264 additions and 40 deletions

View File

@@ -43,7 +43,7 @@
"-s -w"
];
vendorHash = "sha256-UQl8AeijqJd2qpVZBDuHT/+Dtd3+Uwrf4w4yAOaFs98=";
vendorHash = "sha256-oVXELOXbRTzzU8pUGNE4K552thlZXGAX7qpv6ETwz6o=";
};
in
{

2
go.mod
View File

@@ -3,7 +3,7 @@ module codeberg.org/git-pages/git-pages
go 1.25.0
require (
codeberg.org/git-pages/go-headers v1.0.0
codeberg.org/git-pages/go-headers v1.1.0
github.com/KimMachineGun/automemlimit v0.7.5
github.com/c2h5oh/datasize v0.0.0-20231215233829-aa82cc1e6500
github.com/creasty/defaults v1.8.0

2
go.sum
View File

@@ -1,5 +1,7 @@
codeberg.org/git-pages/go-headers v1.0.0 h1:hvGU97hQdXaT5HwCpZJWQdg7akvtOBCSUNL4u2a5uTs=
codeberg.org/git-pages/go-headers v1.0.0/go.mod h1:N4gwH0U3YPwmuyxqH7xBA8j44fTPX+vOEP7ejJVBPts=
codeberg.org/git-pages/go-headers v1.1.0 h1:rk7/SOSsn+XuL7PUQZFYUaWKHEaj6K8mXmUV9rF2VxE=
codeberg.org/git-pages/go-headers v1.1.0/go.mod h1:N4gwH0U3YPwmuyxqH7xBA8j44fTPX+vOEP7ejJVBPts=
github.com/KimMachineGun/automemlimit v0.7.5 h1:RkbaC0MwhjL1ZuBKunGDjE/ggwAX43DwZrJqVwyveTk=
github.com/KimMachineGun/automemlimit v0.7.5/go.mod h1:QZxpHaGOQoYvFhv/r4u3U0JTC2ZcOwbSr11UZF46UBM=
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=

View File

@@ -310,6 +310,7 @@ func authorizeCodebergPagesV2(r *http.Request) (*Authorization, error) {
}
}
// Checks whether an operation that enables enumerating site contents is allowed.
func AuthorizeMetadataRetrieval(r *http.Request) (*Authorization, error) {
causes := []error{AuthError{http.StatusUnauthorized, "unauthorized"}}

126
src/collect.go Normal file
View File

@@ -0,0 +1,126 @@
package git_pages
import (
	"archive/tar"
	"context"
	"fmt"
	"io"
	"sort"
	"time"
)
// Flusher is implemented by writers that buffer output and can push any
// buffered bytes downstream on demand (e.g. `bufio.Writer`). CollectTar
// uses it to flush the underlying writer after the archive is complete.
type Flusher interface {
	Flush() error
}
// Inverse of `ExtractTar`.
//
// CollectTar writes the site described by `manifest` to `writer` as a tar
// archive. Inline data is emitted directly (undoing the Zstandard transform
// where present), external files are fetched from the backend, and
// `_redirects`/`_headers` files are reconstituted from the parsed rules in
// the manifest. Entries are written in sorted name order so the archive is
// deterministic. If `writer` implements `Flusher`, it is flushed once the
// archive is complete.
func CollectTar(
	ctx context.Context, writer io.Writer, manifest *Manifest, manifestMtime time.Time,
) (
	err error,
) {
	archive := tar.NewWriter(writer)
	// Writes one archive entry; `data` is decompressed first when `transform`
	// says it is stored compressed, so the archive holds original file bytes.
	appendFile := func(header *tar.Header, data []byte, transform Transform) (err error) {
		switch transform {
		case Transform_None:
		case Transform_Zstandard:
			data, err = zstdDecoder.DecodeAll(data, []byte{})
			if err != nil {
				return err
			}
		default:
			return fmt.Errorf("unexpected transform")
		}
		header.Size = int64(len(data))
		err = archive.WriteHeader(header)
		if err != nil {
			return
		}
		_, err = archive.Write(data)
		return
	}
	// Sort entry names so the archive layout does not depend on map order.
	fileNames := make([]string, 0, len(manifest.Contents))
	for fileName := range manifest.Contents {
		if fileName != "" {
			fileNames = append(fileNames, fileName)
		}
	}
	sort.Strings(fileNames)
	for _, fileName := range fileNames {
		entry := manifest.Contents[fileName]
		var header tar.Header
		header.Name = fileName
		switch entry.GetType() {
		case Type_Directory:
			header.Typeflag = tar.TypeDir
			header.Mode = 0755
			header.ModTime = manifestMtime
			err = appendFile(&header, nil, Transform_None)
		case Type_InlineFile:
			header.Typeflag = tar.TypeReg
			header.Mode = 0644
			header.ModTime = manifestMtime
			err = appendFile(&header, entry.GetData(), entry.GetTransform())
		case Type_ExternalFile:
			var blobReader io.Reader
			var blobMtime time.Time
			var blobData []byte
			blobReader, _, blobMtime, err = backend.GetBlob(ctx, string(entry.Data))
			if err != nil {
				return
			}
			// A failed read must not silently truncate the entry.
			blobData, err = io.ReadAll(blobReader)
			if err != nil {
				return
			}
			header.Typeflag = tar.TypeReg
			header.Mode = 0644
			header.ModTime = blobMtime
			err = appendFile(&header, blobData, entry.GetTransform())
		case Type_Symlink:
			// Tar stores a symlink's target in `Linkname`, not in entry data:
			// `archive/tar` rejects a non-zero `Size` for header-only types
			// such as `TypeSymlink`, so the target must not be written as data.
			header.Typeflag = tar.TypeSymlink
			header.Mode = 0644
			header.ModTime = manifestMtime
			header.Linkname = string(entry.GetData())
			err = appendFile(&header, nil, Transform_None)
		default:
			return fmt.Errorf("unexpected entry type")
		}
		if err != nil {
			return err
		}
	}
	if redirects := CollectRedirectsFile(manifest); redirects != "" {
		err = appendFile(&tar.Header{
			Name:     RedirectsFileName,
			Typeflag: tar.TypeReg,
			Mode:     0644,
			ModTime:  manifestMtime,
		}, []byte(redirects), Transform_None)
		if err != nil {
			return err
		}
	}
	if headers := CollectHeadersFile(manifest); headers != "" {
		err = appendFile(&tar.Header{
			Name:     HeadersFileName,
			Typeflag: tar.TypeReg,
			Mode:     0644,
			ModTime:  manifestMtime,
		}, []byte(headers), Transform_None)
		if err != nil {
			return err
		}
	}
	// `Close` (unlike `Flush`) also writes the end-of-archive marker (two
	// zero blocks) that the tar format requires.
	err = archive.Close()
	if err != nil {
		return err
	}
	if flusher, ok := writer.(Flusher); ok {
		err = flusher.Flush()
	}
	return err
}

View File

@@ -15,7 +15,7 @@ import (
var ErrHeaderNotAllowed = errors.New("custom header not allowed")
const headersFileName string = "_headers"
const HeadersFileName string = "_headers"
// Lifted from https://docs.netlify.com/manage/routing/headers/, except for `Set-Cookie`
// the rationale for which does not apply in our environment.
@@ -86,24 +86,24 @@ func validateHeaderRule(rule headers.Rule) error {
// Parses redirects file and injects rules into the manifest.
func ProcessHeadersFile(manifest *Manifest) error {
headersEntry := manifest.Contents[headersFileName]
delete(manifest.Contents, headersFileName)
headersEntry := manifest.Contents[HeadersFileName]
delete(manifest.Contents, HeadersFileName)
if headersEntry == nil {
return nil
} else if headersEntry.GetType() != Type_InlineFile {
return AddProblem(manifest, headersFileName,
return AddProblem(manifest, HeadersFileName,
"not a regular file")
}
rules, err := headers.ParseString(string(headersEntry.GetData()))
if err != nil {
return AddProblem(manifest, headersFileName,
return AddProblem(manifest, HeadersFileName,
"syntax error: %s", err)
}
for index, rule := range rules {
if err := validateHeaderRule(rule); err != nil {
AddProblem(manifest, headersFileName,
AddProblem(manifest, HeadersFileName,
"rule #%d %q: %s", index+1, rule.Path, err)
continue
}
@@ -122,6 +122,21 @@ func ProcessHeadersFile(manifest *Manifest) error {
return nil
}
// CollectHeadersFile reconstitutes the `_headers` file text from the header
// rules stored in the manifest (inverse of `ProcessHeadersFile`).
func CollectHeadersFile(manifest *Manifest) string {
	var rules []headers.Rule
	for _, manifestRule := range manifest.GetHeaders() {
		ruleHeaders := http.Header{}
		for _, manifestHeader := range manifestRule.GetHeaderMap() {
			ruleHeaders[manifestHeader.GetName()] = manifestHeader.GetValues()
		}
		rules = append(rules, headers.Rule{
			Path:    manifestRule.GetPath(),
			Headers: ruleHeaders,
		})
	}
	return headers.Must(headers.UnparseString(rules))
}
func ApplyHeaderRules(manifest *Manifest, url *url.URL) (headers http.Header, err error) {
headers = http.Header{}
fromSegments := pathSegments(url.Path)

View File

@@ -69,6 +69,18 @@ func serve(listener net.Listener, handler http.Handler) {
}
}
// webRootArg normalizes a CLI site-name argument into a web root: a bare
// domain maps to its `.index` site, and `domain.tld/dir` is used as-is.
// Any other shape aborts the process with a usage error.
func webRootArg(arg string) string {
	switch strings.Count(arg, "/") {
	case 0:
		return arg + "/.index"
	case 1:
		return arg
	default:
		// fixed: the closing quote around 'domain.tld/dir' was missing
		log.Fatalf("webroot argument must be either 'domain.tld' or 'domain.tld/dir'")
		return ""
	}
}
func Main() {
printConfigEnvVars := flag.Bool("print-config-env-vars", false,
"print every recognized configuration environment variable and exit")
@@ -80,16 +92,28 @@ func Main() {
"run without configuration file (configure via environment variables)")
runMigration := flag.String("run-migration", "",
"run a specific store migration (available: \"create-domain-markers\")")
getManifest := flag.String("get-manifest", "",
"write manifest for `webroot` (either 'domain.tld' or 'domain.tld/dir') to stdout as ProtoJSON")
getBlob := flag.String("get-blob", "",
"write `blob` ('sha256-xxxxxxx...xxx') to stdout")
"write contents of `blob-ref` ('sha256-xxxxxxx...xxx') to stdout")
getManifest := flag.String("get-manifest", "",
"write manifest for `site-name` (either 'domain.tld' or 'domain.tld/dir') to stdout as ProtoJSON")
getArchive := flag.String("get-archive", "",
"write archive for `site-name` (either 'domain.tld' or 'domain.tld/dir') to stdout in tar format")
updateSite := flag.String("update-site", "",
"update site for `webroot` (either 'domain.tld' or 'domain.tld/dir') from archive or repository URL")
"update site for `site-name` (either 'domain.tld' or 'domain.tld/dir') from archive or repository URL")
flag.Parse()
if *getManifest != "" && *getBlob != "" {
log.Fatalln("-get-manifest and -get-blob are mutually exclusive")
var cliOperations int
if *getBlob != "" {
cliOperations += 1
}
if *getManifest != "" {
cliOperations += 1
}
if *getArchive != "" {
cliOperations += 1
}
if cliOperations > 1 {
log.Fatalln("-get-blob, -get-manifest, and -get-archive are mutually exclusive")
}
if *configTomlPath != "" && *noConfig {
@@ -150,22 +174,6 @@ func Main() {
log.Fatalln(err)
}
case *getManifest != "":
if err := ConfigureBackend(&config.Storage); err != nil {
log.Fatalln(err)
}
webRoot := *getManifest
if !strings.Contains(webRoot, "/") {
webRoot += "/.index"
}
manifest, _, err := backend.GetManifest(context.Background(), webRoot, GetManifestOptions{})
if err != nil {
log.Fatalln(err)
}
fmt.Println(ManifestDebugJSON(manifest))
case *getBlob != "":
if err := ConfigureBackend(&config.Storage); err != nil {
log.Fatalln(err)
@@ -178,6 +186,31 @@ func Main() {
io.Copy(os.Stdout, reader)
case *getManifest != "":
if err := ConfigureBackend(&config.Storage); err != nil {
log.Fatalln(err)
}
webRoot := webRootArg(*getManifest)
manifest, _, err := backend.GetManifest(context.Background(), webRoot, GetManifestOptions{})
if err != nil {
log.Fatalln(err)
}
fmt.Println(ManifestDebugJSON(manifest))
case *getArchive != "":
if err := ConfigureBackend(&config.Storage); err != nil {
log.Fatalln(err)
}
webRoot := webRootArg(*getArchive)
manifest, manifestMtime, err :=
backend.GetManifest(context.Background(), webRoot, GetManifestOptions{})
if err != nil {
log.Fatalln(err)
}
CollectTar(context.Background(), os.Stdout, manifest, manifestMtime)
case *updateSite != "":
if err := ConfigureBackend(&config.Storage); err != nil {
log.Fatalln(err)

View File

@@ -2,6 +2,7 @@ package git_pages
import (
"bytes"
"compress/gzip"
"context"
"encoding/json"
"errors"
@@ -159,13 +160,14 @@ func getPage(w http.ResponseWriter, r *http.Request) error {
}
if metadataPath, found := strings.CutPrefix(sitePath, ".git-pages/"); found {
lastModified := manifestMtime.UTC().Format(http.TimeFormat)
switch metadataPath {
case "health":
switch {
case metadataPath == "health":
w.Header().Add("Last-Modified", lastModified)
w.WriteHeader(http.StatusOK)
fmt.Fprintf(w, "ok\n")
return nil
case "manifest.json":
case metadataPath == "manifest.json":
// metadata requests require authorization to avoid making pushes from private
// repositories enumerable
_, err := AuthorizeMetadataRetrieval(r)
@@ -177,12 +179,42 @@ func getPage(w http.ResponseWriter, r *http.Request) error {
w.Header().Add("Last-Modified", lastModified)
w.WriteHeader(http.StatusOK)
w.Write([]byte(ManifestDebugJSON(manifest)))
return nil
case metadataPath == "archive.tar" && config.Feature("archive-site"):
// same as above
_, err := AuthorizeMetadataRetrieval(r)
if err != nil {
return err
}
// we only offer `/.git-pages/archive.tar` and not the `.tar.gz`/`.tar.zst` variants
// because HTTP can already request compression using the `Content-Encoding` mechanism
acceptedEncodings := parseHTTPEncodings(r.Header.Get("Accept-Encoding"))
negotiated := acceptedEncodings.Negotiate("zstd", "gzip", "identity")
if negotiated != "" {
w.Header().Set("Content-Encoding", negotiated)
}
w.Header().Add("Content-Type", "application/x-tar")
w.Header().Add("Last-Modified", lastModified)
w.Header().Add("Transfer-Encoding", "chunked")
w.WriteHeader(http.StatusOK)
var iow io.Writer
switch negotiated {
case "", "identity":
iow = w
case "gzip":
iow = gzip.NewWriter(w)
case "zstd":
iow, _ = zstd.NewWriter(w)
}
return CollectTar(r.Context(), iow, manifest, manifestMtime)
default:
w.WriteHeader(http.StatusNotFound)
fmt.Fprintf(w, "not found\n")
return nil
}
return nil
}
entryPath := sitePath
@@ -297,6 +329,8 @@ func getPage(w http.ResponseWriter, r *http.Request) error {
default:
negotiatedEncoding = false
}
default:
return fmt.Errorf("unexpected transform")
}
if !negotiatedEncoding {
w.WriteHeader(http.StatusNotAcceptable)

View File

@@ -11,7 +11,7 @@ import (
"google.golang.org/protobuf/proto"
)
const redirectsFileName string = "_redirects"
const RedirectsFileName string = "_redirects"
func unparseRule(rule redirects.Rule) string {
var statusPart string
@@ -87,24 +87,24 @@ func validateRedirectRule(rule redirects.Rule) error {
// Parses redirects file and injects rules into the manifest.
func ProcessRedirectsFile(manifest *Manifest) error {
redirectsEntry := manifest.Contents[redirectsFileName]
delete(manifest.Contents, redirectsFileName)
redirectsEntry := manifest.Contents[RedirectsFileName]
delete(manifest.Contents, RedirectsFileName)
if redirectsEntry == nil {
return nil
} else if redirectsEntry.GetType() != Type_InlineFile {
return AddProblem(manifest, redirectsFileName,
return AddProblem(manifest, RedirectsFileName,
"not a regular file")
}
rules, err := redirects.ParseString(string(redirectsEntry.GetData()))
if err != nil {
return AddProblem(manifest, redirectsFileName,
return AddProblem(manifest, RedirectsFileName,
"syntax error: %s", err)
}
for index, rule := range rules {
if err := validateRedirectRule(rule); err != nil {
AddProblem(manifest, redirectsFileName,
AddProblem(manifest, RedirectsFileName,
"rule #%d %q: %s", index+1, unparseRule(rule), err)
continue
}
@@ -118,6 +118,19 @@ func ProcessRedirectsFile(manifest *Manifest) error {
return nil
}
// CollectRedirectsFile reconstitutes the `_redirects` file text from the
// redirect rules stored in the manifest (inverse of `ProcessRedirectsFile`).
func CollectRedirectsFile(manifest *Manifest) string {
	var builder strings.Builder
	for _, manifestRule := range manifest.GetRedirects() {
		builder.WriteString(unparseRule(redirects.Rule{
			From:   manifestRule.GetFrom(),
			To:     manifestRule.GetTo(),
			Status: int(manifestRule.GetStatus()),
			Force:  manifestRule.GetForce(),
		}))
		builder.WriteString("\n")
	}
	return builder.String()
}
// pathSegments splits a URL path into its slash-separated segments,
// dropping a single leading slash first.
func pathSegments(path string) []string {
	trimmed := strings.TrimPrefix(path, "/")
	return strings.Split(trimmed, "/")
}