From 25b1720940054792e70472a8457d1968cac2b5fe Mon Sep 17 00:00:00 2001
From: Catherine
Date: Mon, 29 Sep 2025 00:30:57 +0000
Subject: [PATCH] Compress files with Zstandard.

This can save as much as 30% of storage space while adding negligible
CPU overhead.
---
 src/manifest.go  |  43 +++++++++++++++++++
 src/pages.go     |  22 ++++++++++
 src/schema.pb.go | 108 ++++++++++++++++++++++++++++++++++++++---------
 src/schema.proto |  13 +++++-
 4 files changed, 164 insertions(+), 22 deletions(-)

diff --git a/src/manifest.go b/src/manifest.go
index 296728b..50278d9 100644
--- a/src/manifest.go
+++ b/src/manifest.go
@@ -13,6 +13,7 @@ import (
 	"sync"
 
 	"github.com/c2h5oh/datasize"
+	"github.com/klauspost/compress/zstd"
 	"google.golang.org/protobuf/encoding/protojson"
 	"google.golang.org/protobuf/proto"
 )
@@ -120,6 +121,45 @@ again:
 	}
 }
 
+// CompressFiles compresses the contents of inline files in place using Zstandard.
+//
+// Only `InlineFile` entries that have not been transformed yet are considered. An entry is
+// rewritten (data, size, and transform) only when its compressed form is strictly smaller;
+// otherwise it is left as is. A summary of the space saved is logged afterwards.
+func CompressFiles(manifest *Manifest) {
+	encoder, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedBetterCompression))
+	if err != nil {
+		// Compression is only an optimization; untransformed entries remain valid.
+		log.Printf("compress: cannot create encoder: %s\n", err)
+		return
+	}
+	// Accumulate in 64 bits: the sum of many 32-bit sizes can exceed 4 GiB.
+	var originalSize, transformedSize uint64
+	for _, entry := range manifest.Contents {
+		if entry.GetType() != Type_InlineFile || entry.GetXfrm() != Transform_None {
+			continue
+		}
+		originalSize += uint64(entry.GetSize())
+		compressedData := encoder.EncodeAll(entry.GetData(), make([]byte, 0, entry.GetSize()))
+		// Use the nil-safe getter; dereferencing an unset `Size` would panic.
+		if len(compressedData) < int(entry.GetSize()) {
+			entry.Data = compressedData
+			entry.Size = proto.Uint32(uint32(len(entry.Data)))
+			entry.Xfrm = Transform_Zstandard.Enum()
+		}
+		transformedSize += uint64(entry.GetSize())
+	}
+	if originalSize == 0 {
+		// Nothing was eligible; avoid dividing by zero and logging `NaN%`.
+		return
+	}
+	log.Printf("compress: saved %.2f%% (%s to %s)",
+		(float64(originalSize)-float64(transformedSize))/float64(originalSize)*100.0,
+		datasize.ByteSize(originalSize).HR(),
+		datasize.ByteSize(transformedSize).HR(),
+	)
+}
+
 // Apply post-processing steps to the manifest.
 // At the moment, there isn't a good way to report errors except to log them on the terminal.
 // (Perhaps in the future they could be exposed at `.git-pages/status.txt`?)
@@ -131,6 +155,8 @@ func PrepareManifest(manifest *Manifest) error {
 		log.Printf("redirects ok: %d rules\n", len(manifest.Redirects))
 	}
 
+	CompressFiles(manifest)
+
 	return nil
 }
 
@@ -157,6 +183,7 @@ func StoreManifest(name string, manifest *Manifest) (*Manifest, error) {
 				Type: Type_ExternalFile.Enum(),
 				Size: entry.Size,
 				Data: fmt.Appendf(nil, "sha256-%x", sha256.Sum256(entry.Data)),
+				Xfrm: entry.Xfrm,
 			}
 		} else {
 			extManifest.Contents[name] = entry
diff --git a/src/pages.go b/src/pages.go
index b091c3c..6476da2 100644
--- a/src/pages.go
+++ b/src/pages.go
@@ -16,6 +17,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/klauspost/compress/zstd"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
 )
@@ -60,6 +61,10 @@ func reportSiteUpdate(via string, result *UpdateResult) {
 	}
 }
 
+// The `klauspost/compress/zstd` package recommends reusing a decompressor to avoid repeated
+// allocations of internal buffers.
+var zstdDecoder, _ = zstd.NewReader(nil)
+
 func getPage(w http.ResponseWriter, r *http.Request) error {
 	var err error
 	var sitePath string
@@ -198,6 +203,30 @@ func getPage(w http.ResponseWriter, r *http.Request) error {
 		defer closer.Close()
 	}
 
+	switch entry.GetXfrm() {
+	case Transform_None:
+		// nothing to do
+	case Transform_Zstandard:
+		// Ideally, we would serve zstd-compressed data to a client that indicates support with
+		// an `Accept-Encoding: zstd` header. Unfortunately we can't because we rely on MIME
+		// type detection done in `http.ServeContent`.
+		compressedData, err := io.ReadAll(reader)
+		if err != nil {
+			// Report the read failure itself rather than handing a possibly empty or
+			// truncated buffer to the decoder.
+			w.WriteHeader(http.StatusInternalServerError)
+			fmt.Fprintf(w, "internal server error: %s\n", err)
+			return err
+		}
+		decompressedData, err := zstdDecoder.DecodeAll(compressedData, nil)
+		if err != nil {
+			w.WriteHeader(http.StatusInternalServerError)
+			fmt.Fprintf(w, "internal server error: %s\n", err)
+			return err
+		}
+		reader = bytes.NewReader(decompressedData)
+	}
+
 	// decide on the HTTP status
 	if status != 200 {
 		w.WriteHeader(status)
diff --git a/src/schema.pb.go b/src/schema.pb.go
index f89ac7a..a90d456 100644
--- a/src/schema.pb.go
+++ b/src/schema.pb.go
@@ -81,10 +81,59 @@ func (Type) EnumDescriptor() ([]byte, []int) {
 	return file_schema_proto_rawDescGZIP(), []int{0}
 }
 
+type Transform int32
+
+const (
+	// No transformation.
+	Transform_None Transform = 0
+	// Zstandard compression.
+	Transform_Zstandard Transform = 1
+)
+
+// Enum value maps for Transform.
+var (
+	Transform_name = map[int32]string{
+		0: "None",
+		1: "Zstandard",
+	}
+	Transform_value = map[string]int32{
+		"None":      0,
+		"Zstandard": 1,
+	}
+)
+
+func (x Transform) Enum() *Transform {
+	p := new(Transform)
+	*p = x
+	return p
+}
+
+func (x Transform) String() string {
+	return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x))
+}
+
+func (Transform) Descriptor() protoreflect.EnumDescriptor {
+	return file_schema_proto_enumTypes[1].Descriptor()
+}
+
+func (Transform) Type() protoreflect.EnumType {
+	return &file_schema_proto_enumTypes[1]
+}
+
+func (x Transform) Number() protoreflect.EnumNumber {
+	return protoreflect.EnumNumber(x)
+}
+
+// Deprecated: Use Transform.Descriptor instead.
+func (Transform) EnumDescriptor() ([]byte, []int) {
+	return file_schema_proto_rawDescGZIP(), []int{1}
+}
+
 type Entry struct {
 	state protoimpl.MessageState `protogen:"open.v1"`
 	Type  *Type                  `protobuf:"varint,1,opt,name=type,enum=Type" json:"type,omitempty"`
-	// Only present for `type == InlineFile` and `type == ExternalFile`
+	// Only present for `type == InlineFile` and `type == ExternalFile`.
+ // For transformed entries, refers to the post-transformation (compressed) size. Size *uint32 `protobuf:"varint,2,opt,name=size" json:"size,omitempty"` // Meaning depends on `type`: // - If `type == InlineFile`, contains file data. @@ -92,7 +141,10 @@ type Entry struct { // cryptographically secure content hash). // - If `type == Symlink`, contains link target. // - Otherwise not present. - Data []byte `protobuf:"bytes,3,opt,name=data" json:"data,omitempty"` + Data []byte `protobuf:"bytes,3,opt,name=data" json:"data,omitempty"` + // Only present for `type == InlineFile` and `type == ExternalFile` that + // have been transformed. + Xfrm *Transform `protobuf:"varint,4,opt,name=xfrm,enum=Transform" json:"xfrm,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -148,6 +200,13 @@ func (x *Entry) GetData() []byte { return nil } +func (x *Entry) GetXfrm() Transform { + if x != nil && x.Xfrm != nil { + return *x.Xfrm + } + return Transform_None +} + // See https://docs.netlify.com/manage/routing/redirects/overview/ for details. // Only a subset of the Netlify specification is representable here. 
type Redirect struct { @@ -370,11 +429,13 @@ var File_schema_proto protoreflect.FileDescriptor const file_schema_proto_rawDesc = "" + "\n" + - "\fschema.proto\"J\n" + + "\fschema.proto\"j\n" + "\x05Entry\x12\x19\n" + "\x04type\x18\x01 \x01(\x0e2\x05.TypeR\x04type\x12\x12\n" + "\x04size\x18\x02 \x01(\rR\x04size\x12\x12\n" + - "\x04data\x18\x03 \x01(\fR\x04data\"\\\n" + + "\x04data\x18\x03 \x01(\fR\x04data\x12\x1e\n" + + "\x04xfrm\x18\x04 \x01(\x0e2\n" + + ".TransformR\x04xfrm\"\\\n" + "\bRedirect\x12\x12\n" + "\x04from\x18\x01 \x01(\tR\x04from\x12\x0e\n" + "\x02to\x18\x02 \x01(\tR\x02to\x12\x16\n" + @@ -401,7 +462,10 @@ const file_schema_proto_rawDesc = "" + "\n" + "InlineFile\x10\x02\x12\x10\n" + "\fExternalFile\x10\x03\x12\v\n" + - "\aSymlink\x10\x04B'Z%codeberg.org/git-pages/git-pages/mainb\beditionsp\xe8\a" + "\aSymlink\x10\x04*$\n" + + "\tTransform\x12\b\n" + + "\x04None\x10\x00\x12\r\n" + + "\tZstandard\x10\x01B'Z%codeberg.org/git-pages/git-pages/mainb\beditionsp\xe8\a" var ( file_schema_proto_rawDescOnce sync.Once @@ -415,27 +479,29 @@ func file_schema_proto_rawDescGZIP() []byte { return file_schema_proto_rawDescData } -var file_schema_proto_enumTypes = make([]protoimpl.EnumInfo, 1) +var file_schema_proto_enumTypes = make([]protoimpl.EnumInfo, 2) var file_schema_proto_msgTypes = make([]protoimpl.MessageInfo, 5) var file_schema_proto_goTypes = []any{ (Type)(0), // 0: Type - (*Entry)(nil), // 1: Entry - (*Redirect)(nil), // 2: Redirect - (*Problem)(nil), // 3: Problem - (*Manifest)(nil), // 4: Manifest - nil, // 5: Manifest.ContentsEntry + (Transform)(0), // 1: Transform + (*Entry)(nil), // 2: Entry + (*Redirect)(nil), // 3: Redirect + (*Problem)(nil), // 4: Problem + (*Manifest)(nil), // 5: Manifest + nil, // 6: Manifest.ContentsEntry } var file_schema_proto_depIdxs = []int32{ 0, // 0: Entry.type:type_name -> Type - 5, // 1: Manifest.contents:type_name -> Manifest.ContentsEntry - 2, // 2: Manifest.redirects:type_name -> Redirect - 3, // 3: 
Manifest.problems:type_name -> Problem - 1, // 4: Manifest.ContentsEntry.value:type_name -> Entry - 5, // [5:5] is the sub-list for method output_type - 5, // [5:5] is the sub-list for method input_type - 5, // [5:5] is the sub-list for extension type_name - 5, // [5:5] is the sub-list for extension extendee - 0, // [0:5] is the sub-list for field type_name + 1, // 1: Entry.xfrm:type_name -> Transform + 6, // 2: Manifest.contents:type_name -> Manifest.ContentsEntry + 3, // 3: Manifest.redirects:type_name -> Redirect + 4, // 4: Manifest.problems:type_name -> Problem + 2, // 5: Manifest.ContentsEntry.value:type_name -> Entry + 6, // [6:6] is the sub-list for method output_type + 6, // [6:6] is the sub-list for method input_type + 6, // [6:6] is the sub-list for extension type_name + 6, // [6:6] is the sub-list for extension extendee + 0, // [0:6] is the sub-list for field type_name } func init() { file_schema_proto_init() } @@ -448,7 +514,7 @@ func file_schema_proto_init() { File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_schema_proto_rawDesc), len(file_schema_proto_rawDesc)), - NumEnums: 1, + NumEnums: 2, NumMessages: 5, NumExtensions: 0, NumServices: 0, diff --git a/src/schema.proto b/src/schema.proto index 61c4667..f8ff0d7 100644 --- a/src/schema.proto +++ b/src/schema.proto @@ -15,9 +15,17 @@ enum Type { Symlink = 4; } +enum Transform { + // No transformation. + None = 0; + // Zstandard compression. + Zstandard = 1; +} + message Entry { Type type = 1; - // Only present for `type == InlineFile` and `type == ExternalFile` + // Only present for `type == InlineFile` and `type == ExternalFile`. + // For transformed entries, refers to the post-transformation (compressed) size. uint32 size = 2; // Meaning depends on `type`: // * If `type == InlineFile`, contains file data. @@ -26,6 +34,9 @@ message Entry { // * If `type == Symlink`, contains link target. // * Otherwise not present. 
bytes data = 3; + // Only present for `type == InlineFile` and `type == ExternalFile` that + // have been transformed. + Transform xfrm = 4; } // See https://docs.netlify.com/manage/routing/redirects/overview/ for details.