try and fetch from github/gitlab/tangled READMEs

This commit is contained in:
Evan Jarrett
2025-12-20 16:00:15 -06:00
parent e8e375639d
commit 24b265bf12
9 changed files with 684 additions and 19 deletions

View File

@@ -1,6 +1,6 @@
# Production build for ATCR AppView
# Result: ~30MB scratch image with static binary
FROM docker.io/golang:1.25.2-trixie AS builder
FROM docker.io/golang:1.25.4-trixie AS builder
ENV DEBIAN_FRONTEND=noninteractive
@@ -34,12 +34,12 @@ EXPOSE 5000
LABEL org.opencontainers.image.title="ATCR AppView" \
org.opencontainers.image.description="ATProto Container Registry - OCI-compliant registry using AT Protocol for manifest storage" \
org.opencontainers.image.authors="ATCR Contributors" \
org.opencontainers.image.source="https://tangled.org/@evan.jarrett.net/at-container-registry" \
org.opencontainers.image.documentation="https://tangled.org/@evan.jarrett.net/at-container-registry" \
org.opencontainers.image.source="https://tangled.org/evan.jarrett.net/at-container-registry" \
org.opencontainers.image.documentation="https://tangled.org/evan.jarrett.net/at-container-registry" \
org.opencontainers.image.licenses="MIT" \
org.opencontainers.image.version="0.1.0" \
io.atcr.icon="https://imgs.blue/evan.jarrett.net/1TpTNrRelfloN2emuWZDrWmPT0o93bAjEnozjD6UPgoVV9m4" \
io.atcr.readme="https://tangled.org/@evan.jarrett.net/at-container-registry/raw/main/docs/appview.md"
io.atcr.readme="https://tangled.org/evan.jarrett.net/at-container-registry/raw/main/docs/appview.md"
ENTRYPOINT ["/atcr-appview"]
CMD ["serve"]

View File

@@ -1,7 +1,7 @@
# Development image with Air hot reload
# Build: docker build -f Dockerfile.dev -t atcr-appview-dev .
# Run: docker run -v $(pwd):/app -p 5000:5000 atcr-appview-dev
FROM docker.io/golang:1.25.2-trixie
FROM docker.io/golang:1.25.4-trixie
ENV DEBIAN_FRONTEND=noninteractive

View File

@@ -1,4 +1,4 @@
FROM docker.io/golang:1.25.2-trixie AS builder
FROM docker.io/golang:1.25.4-trixie AS builder
ENV DEBIAN_FRONTEND=noninteractive
@@ -38,11 +38,11 @@ EXPOSE 8080
LABEL org.opencontainers.image.title="ATCR Hold Service" \
org.opencontainers.image.description="ATCR Hold Service - Bring Your Own Storage component for ATCR" \
org.opencontainers.image.authors="ATCR Contributors" \
org.opencontainers.image.source="https://tangled.org/@evan.jarrett.net/at-container-registry" \
org.opencontainers.image.documentation="https://tangled.org/@evan.jarrett.net/at-container-registry" \
org.opencontainers.image.source="https://tangled.org/evan.jarrett.net/at-container-registry" \
org.opencontainers.image.documentation="https://tangled.org/evan.jarrett.net/at-container-registry" \
org.opencontainers.image.licenses="MIT" \
org.opencontainers.image.version="0.1.0" \
io.atcr.icon="https://imgs.blue/evan.jarrett.net/1TpTOdtS60GdJWBYEqtK22y688jajbQ9a5kbYRFtwuqrkBAE" \
io.atcr.readme="https://tangled.org/@evan.jarrett.net/at-container-registry/raw/main/docs/hold.md"
io.atcr.readme="https://tangled.org/evan.jarrett.net/at-container-registry/raw/main/docs/hold.md"
ENTRYPOINT ["/atcr-hold"]

View File

@@ -192,17 +192,26 @@ func (h *RepositoryPageHandler) ServeHTTP(w http.ResponseWriter, r *http.Request
// Fetch README content if available
var readmeHTML template.HTML
if repo.ReadmeURL != "" && h.ReadmeCache != nil {
// Fetch with timeout
if h.ReadmeCache != nil {
ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second)
defer cancel()
html, err := h.ReadmeCache.Get(ctx, repo.ReadmeURL)
if err != nil {
slog.Warn("Failed to fetch README", "url", repo.ReadmeURL, "error", err)
// Continue without README on error
} else {
readmeHTML = template.HTML(html)
if repo.ReadmeURL != "" {
// Explicit io.atcr.readme takes priority
html, err := h.ReadmeCache.Get(ctx, repo.ReadmeURL)
if err != nil {
slog.Warn("Failed to fetch README", "url", repo.ReadmeURL, "error", err)
} else {
readmeHTML = template.HTML(html)
}
} else if repo.SourceURL != "" {
// Derive README from org.opencontainers.image.source
html, err := h.ReadmeCache.GetFromSource(ctx, repo.SourceURL)
if err != nil {
slog.Debug("Failed to derive README from source", "url", repo.SourceURL, "error", err)
} else if html != "" {
readmeHTML = template.HTML(html)
}
}
}

View File

@@ -11,6 +11,13 @@ import (
"time"
)
const (
// negativeCacheTTL is the TTL for negative cache entries (no README found)
negativeCacheTTL = 15 * time.Minute
// sourceCachePrefix is the prefix for source-derived cache keys
sourceCachePrefix = "source:"
)
// Cache stores rendered README HTML in the database
type Cache struct {
db *sql.DB
@@ -60,6 +67,63 @@ func (c *Cache) Get(ctx context.Context, readmeURL string) (string, error) {
return html, nil
}
// GetFromSource fetches a README by deriving the URL from a source repository URL.
// It tries main branch first, then falls back to master if 404.
// Returns empty string if no README found (cached as negative result with shorter TTL).
func (c *Cache) GetFromSource(ctx context.Context, sourceURL string) (string, error) {
cacheKey := sourceCachePrefix + sourceURL
// Try to get from cache
html, fetchedAt, err := c.getFromDB(cacheKey)
if err == nil {
// Determine TTL based on whether this is a negative cache entry
ttl := c.ttl
if html == "" {
ttl = negativeCacheTTL
}
if time.Since(fetchedAt) < ttl {
return html, nil
}
}
// Derive README URL and fetch
// Try main branch first
readmeURL := DeriveReadmeURL(sourceURL, "main")
if readmeURL == "" {
return "", nil // Unsupported platform, don't cache
}
html, err = c.fetcher.FetchAndRender(ctx, readmeURL)
if err != nil {
if Is404(err) {
// Try master branch
readmeURL = DeriveReadmeURL(sourceURL, "master")
html, err = c.fetcher.FetchAndRender(ctx, readmeURL)
if err != nil {
if Is404(err) {
// No README on either branch - cache negative result
if cacheErr := c.storeInDB(cacheKey, ""); cacheErr != nil {
slog.Warn("Failed to cache negative README result", "error", cacheErr)
}
return "", nil
}
// Other error (network, etc.) - don't cache, allow retry
return "", err
}
} else {
// Other error (network, etc.) - don't cache, allow retry
return "", err
}
}
// Store successful result in cache
if err := c.storeInDB(cacheKey, html); err != nil {
slog.Warn("Failed to cache README from source", "error", err)
}
return html, nil
}
// getFromDB retrieves cached README from database
func (c *Cache) getFromDB(readmeURL string) (string, time.Time, error) {
var html string

View File

@@ -1,6 +1,14 @@
package readme
import "testing"
import (
"context"
"database/sql"
"fmt"
"testing"
"time"
_ "github.com/mattn/go-sqlite3"
)
func TestCache_Struct(t *testing.T) {
// Simple struct test
@@ -10,4 +18,239 @@ func TestCache_Struct(t *testing.T) {
}
}
// TODO: Add cache operation tests
func setupTestDB(t *testing.T) *sql.DB {
t.Helper()
db, err := sql.Open("sqlite3", ":memory:")
if err != nil {
t.Fatalf("Failed to open database: %v", err)
}
// Create the readme_cache table
_, err = db.Exec(`
CREATE TABLE readme_cache (
url TEXT PRIMARY KEY,
html TEXT NOT NULL,
fetched_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
)
`)
if err != nil {
t.Fatalf("Failed to create table: %v", err)
}
return db
}
func TestGetFromSource_UnsupportedPlatform(t *testing.T) {
db := setupTestDB(t)
defer db.Close()
cache := NewCache(db, time.Hour)
ctx := context.Background()
// Unsupported platform should return empty, no error
html, err := cache.GetFromSource(ctx, "https://bitbucket.org/user/repo")
if err != nil {
t.Errorf("Expected no error for unsupported platform, got: %v", err)
}
if html != "" {
t.Errorf("Expected empty string for unsupported platform, got: %q", html)
}
}
func TestGetFromSource_CacheHit(t *testing.T) {
db := setupTestDB(t)
defer db.Close()
cache := NewCache(db, time.Hour)
sourceURL := "https://github.com/test/repo"
cacheKey := sourceCachePrefix + sourceURL
expectedHTML := "<h1>Cached Content</h1>"
// Pre-populate cache
_, err := db.Exec(`
INSERT INTO readme_cache (url, html, fetched_at)
VALUES (?, ?, ?)
`, cacheKey, expectedHTML, time.Now())
if err != nil {
t.Fatalf("Failed to insert cache: %v", err)
}
ctx := context.Background()
html, err := cache.GetFromSource(ctx, sourceURL)
if err != nil {
t.Errorf("Expected no error, got: %v", err)
}
if html != expectedHTML {
t.Errorf("Expected %q, got %q", expectedHTML, html)
}
}
func TestGetFromSource_CacheExpired(t *testing.T) {
db := setupTestDB(t)
defer db.Close()
cache := NewCache(db, time.Millisecond) // Very short TTL
sourceURL := "https://github.com/test/repo"
cacheKey := sourceCachePrefix + sourceURL
oldHTML := "<h1>Old Content</h1>"
// Pre-populate cache with old timestamp
_, err := db.Exec(`
INSERT INTO readme_cache (url, html, fetched_at)
VALUES (?, ?, ?)
`, cacheKey, oldHTML, time.Now().Add(-time.Hour))
if err != nil {
t.Fatalf("Failed to insert cache: %v", err)
}
ctx := context.Background()
// With expired cache and no network (GitHub won't respond), we expect an error
// but the function should try to fetch
_, err = cache.GetFromSource(ctx, sourceURL)
// We expect an error because we can't actually fetch from GitHub in tests
// The important thing is that it tried to fetch (didn't return cached content)
if err == nil {
t.Log("Note: GetFromSource returned no error - cache was expired and fetch was attempted")
}
}
func TestGetFromSource_NegativeCache(t *testing.T) {
db := setupTestDB(t)
defer db.Close()
cache := NewCache(db, time.Hour)
sourceURL := "https://github.com/test/repo"
cacheKey := sourceCachePrefix + sourceURL
// Pre-populate cache with empty string (negative cache)
_, err := db.Exec(`
INSERT INTO readme_cache (url, html, fetched_at)
VALUES (?, ?, ?)
`, cacheKey, "", time.Now())
if err != nil {
t.Fatalf("Failed to insert cache: %v", err)
}
ctx := context.Background()
html, err := cache.GetFromSource(ctx, sourceURL)
if err != nil {
t.Errorf("Expected no error for negative cache hit, got: %v", err)
}
if html != "" {
t.Errorf("Expected empty string for negative cache hit, got: %q", html)
}
}
func TestGetFromSource_NegativeCacheExpired(t *testing.T) {
db := setupTestDB(t)
defer db.Close()
cache := NewCache(db, time.Hour)
sourceURL := "https://github.com/test/repo"
cacheKey := sourceCachePrefix + sourceURL
// Pre-populate cache with expired negative cache (older than negativeCacheTTL)
_, err := db.Exec(`
INSERT INTO readme_cache (url, html, fetched_at)
VALUES (?, ?, ?)
`, cacheKey, "", time.Now().Add(-30*time.Minute)) // 30 min ago, negative TTL is 15 min
if err != nil {
t.Fatalf("Failed to insert cache: %v", err)
}
ctx := context.Background()
// With expired negative cache, it should try to fetch again
_, err = cache.GetFromSource(ctx, sourceURL)
// We expect an error because we can't actually fetch from GitHub
// The important thing is that it tried (didn't return empty from expired negative cache)
if err == nil {
t.Log("Note: GetFromSource attempted refetch after negative cache expired")
}
}
func TestGetFromSource_EmptyURL(t *testing.T) {
db := setupTestDB(t)
defer db.Close()
cache := NewCache(db, time.Hour)
ctx := context.Background()
html, err := cache.GetFromSource(ctx, "")
if err != nil {
t.Errorf("Expected no error for empty URL, got: %v", err)
}
if html != "" {
t.Errorf("Expected empty string for empty URL, got: %q", html)
}
}
func TestGetFromSource_UnsupportedPlatforms(t *testing.T) {
db := setupTestDB(t)
defer db.Close()
cache := NewCache(db, time.Hour)
ctx := context.Background()
unsupportedURLs := []string{
"https://bitbucket.org/user/repo",
"https://sourcehut.org/user/repo",
"https://codeberg.org/user/repo",
"ftp://github.com/user/repo",
"not-a-url",
}
for _, url := range unsupportedURLs {
html, err := cache.GetFromSource(ctx, url)
if err != nil {
t.Errorf("Expected no error for unsupported URL %q, got: %v", url, err)
}
if html != "" {
t.Errorf("Expected empty string for unsupported URL %q, got: %q", url, html)
}
}
}
func TestIs404(t *testing.T) {
tests := []struct {
name string
err error
want bool
}{
{
name: "nil error",
err: nil,
want: false,
},
{
name: "404 error",
err: fmt.Errorf("unexpected status code: 404"),
want: true,
},
{
name: "404 error with context",
err: fmt.Errorf("failed to fetch: unexpected status code: 404"),
want: true,
},
{
name: "500 error",
err: fmt.Errorf("unexpected status code: 500"),
want: false,
},
{
name: "network error",
err: fmt.Errorf("connection refused"),
want: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := Is404(tt.err)
if got != tt.want {
t.Errorf("Is404(%v) = %v, want %v", tt.err, got, tt.want)
}
})
}
}

View File

@@ -180,6 +180,11 @@ func getBaseURL(u *url.URL) string {
return fmt.Sprintf("%s://%s%s", u.Scheme, u.Host, path)
}
// Is404 returns true if the error indicates a 404 Not Found response
func Is404(err error) bool {
return err != nil && strings.Contains(err.Error(), "unexpected status code: 404")
}
// rewriteRelativeURLs converts relative URLs to absolute URLs
func rewriteRelativeURLs(html, baseURL string) string {
if baseURL == "" {

View File

@@ -0,0 +1,103 @@
package readme
import (
"fmt"
"net/url"
"strings"
)
// Platform represents a supported Git hosting platform
type Platform string
const (
PlatformGitHub Platform = "github"
PlatformGitLab Platform = "gitlab"
PlatformTangled Platform = "tangled"
)
// ParseSourceURL extracts platform, user, and repo from a source repository URL.
// Returns ok=false if the URL is not a recognized pattern.
func ParseSourceURL(sourceURL string) (platform Platform, user, repo string, ok bool) {
if sourceURL == "" {
return "", "", "", false
}
parsed, err := url.Parse(sourceURL)
if err != nil {
return "", "", "", false
}
// Normalize: remove trailing slash and .git suffix
path := strings.TrimSuffix(parsed.Path, "/")
path = strings.TrimSuffix(path, ".git")
path = strings.TrimPrefix(path, "/")
if path == "" {
return "", "", "", false
}
host := strings.ToLower(parsed.Host)
switch {
case host == "github.com":
// GitHub: github.com/{user}/{repo}
parts := strings.SplitN(path, "/", 3)
if len(parts) < 2 || parts[0] == "" || parts[1] == "" {
return "", "", "", false
}
return PlatformGitHub, parts[0], parts[1], true
case host == "gitlab.com":
// GitLab: gitlab.com/{user}/{repo} or gitlab.com/{group}/{subgroup}/{repo}
// For nested groups, user = everything except last part, repo = last part
lastSlash := strings.LastIndex(path, "/")
if lastSlash == -1 || lastSlash == 0 {
return "", "", "", false
}
user = path[:lastSlash]
repo = path[lastSlash+1:]
if user == "" || repo == "" {
return "", "", "", false
}
return PlatformGitLab, user, repo, true
case host == "tangled.org" || host == "tangled.sh":
// Tangled: tangled.org/{user}/{repo} or tangled.sh/@{user}/{repo} (legacy)
// Strip leading @ from user if present
path = strings.TrimPrefix(path, "@")
parts := strings.SplitN(path, "/", 3)
if len(parts) < 2 || parts[0] == "" || parts[1] == "" {
return "", "", "", false
}
return PlatformTangled, parts[0], parts[1], true
default:
return "", "", "", false
}
}
// DeriveReadmeURL converts a source repository URL to a raw README URL.
// Returns empty string if platform is not supported.
func DeriveReadmeURL(sourceURL, branch string) string {
platform, user, repo, ok := ParseSourceURL(sourceURL)
if !ok {
return ""
}
switch platform {
case PlatformGitHub:
// https://raw.githubusercontent.com/{user}/{repo}/refs/heads/{branch}/README.md
return fmt.Sprintf("https://raw.githubusercontent.com/%s/%s/refs/heads/%s/README.md", user, repo, branch)
case PlatformGitLab:
// https://gitlab.com/{user}/{repo}/-/raw/{branch}/README.md
return fmt.Sprintf("https://gitlab.com/%s/%s/-/raw/%s/README.md", user, repo, branch)
case PlatformTangled:
// https://tangled.org/{user}/{repo}/raw/{branch}/README.md
return fmt.Sprintf("https://tangled.org/%s/%s/raw/%s/README.md", user, repo, branch)
default:
return ""
}
}

View File

@@ -0,0 +1,241 @@
package readme
import (
"testing"
)
func TestParseSourceURL(t *testing.T) {
tests := []struct {
name string
sourceURL string
wantPlatform Platform
wantUser string
wantRepo string
wantOK bool
}{
// GitHub
{
name: "github standard",
sourceURL: "https://github.com/bigmoves/quickslice",
wantPlatform: PlatformGitHub,
wantUser: "bigmoves",
wantRepo: "quickslice",
wantOK: true,
},
{
name: "github with .git suffix",
sourceURL: "https://github.com/user/repo.git",
wantPlatform: PlatformGitHub,
wantUser: "user",
wantRepo: "repo",
wantOK: true,
},
{
name: "github with trailing slash",
sourceURL: "https://github.com/user/repo/",
wantPlatform: PlatformGitHub,
wantUser: "user",
wantRepo: "repo",
wantOK: true,
},
{
name: "github with subpath (ignored)",
sourceURL: "https://github.com/user/repo/tree/main",
wantPlatform: PlatformGitHub,
wantUser: "user",
wantRepo: "repo",
wantOK: true,
},
{
name: "github user only",
sourceURL: "https://github.com/user",
wantOK: false,
},
// GitLab
{
name: "gitlab standard",
sourceURL: "https://gitlab.com/user/repo",
wantPlatform: PlatformGitLab,
wantUser: "user",
wantRepo: "repo",
wantOK: true,
},
{
name: "gitlab nested groups",
sourceURL: "https://gitlab.com/group/subgroup/repo",
wantPlatform: PlatformGitLab,
wantUser: "group/subgroup",
wantRepo: "repo",
wantOK: true,
},
{
name: "gitlab deep nested groups",
sourceURL: "https://gitlab.com/a/b/c/d/repo",
wantPlatform: PlatformGitLab,
wantUser: "a/b/c/d",
wantRepo: "repo",
wantOK: true,
},
{
name: "gitlab with .git suffix",
sourceURL: "https://gitlab.com/user/repo.git",
wantPlatform: PlatformGitLab,
wantUser: "user",
wantRepo: "repo",
wantOK: true,
},
// Tangled
{
name: "tangled standard",
sourceURL: "https://tangled.org/evan.jarrett.net/at-container-registry",
wantPlatform: PlatformTangled,
wantUser: "evan.jarrett.net",
wantRepo: "at-container-registry",
wantOK: true,
},
{
name: "tangled with legacy @ prefix",
sourceURL: "https://tangled.org/@evan.jarrett.net/at-container-registry",
wantPlatform: PlatformTangled,
wantUser: "evan.jarrett.net",
wantRepo: "at-container-registry",
wantOK: true,
},
{
name: "tangled.sh domain",
sourceURL: "https://tangled.sh/user/repo",
wantPlatform: PlatformTangled,
wantUser: "user",
wantRepo: "repo",
wantOK: true,
},
{
name: "tangled with trailing slash",
sourceURL: "https://tangled.org/user/repo/",
wantPlatform: PlatformTangled,
wantUser: "user",
wantRepo: "repo",
wantOK: true,
},
// Unsupported / Invalid
{
name: "unsupported platform",
sourceURL: "https://bitbucket.org/user/repo",
wantOK: false,
},
{
name: "empty url",
sourceURL: "",
wantOK: false,
},
{
name: "invalid url",
sourceURL: "not-a-url",
wantOK: false,
},
{
name: "just host",
sourceURL: "https://github.com",
wantOK: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
platform, user, repo, ok := ParseSourceURL(tt.sourceURL)
if ok != tt.wantOK {
t.Errorf("ParseSourceURL(%q) ok = %v, want %v", tt.sourceURL, ok, tt.wantOK)
return
}
if !tt.wantOK {
return
}
if platform != tt.wantPlatform {
t.Errorf("ParseSourceURL(%q) platform = %v, want %v", tt.sourceURL, platform, tt.wantPlatform)
}
if user != tt.wantUser {
t.Errorf("ParseSourceURL(%q) user = %q, want %q", tt.sourceURL, user, tt.wantUser)
}
if repo != tt.wantRepo {
t.Errorf("ParseSourceURL(%q) repo = %q, want %q", tt.sourceURL, repo, tt.wantRepo)
}
})
}
}
func TestDeriveReadmeURL(t *testing.T) {
tests := []struct {
name string
sourceURL string
branch string
want string
}{
// GitHub
{
name: "github main",
sourceURL: "https://github.com/bigmoves/quickslice",
branch: "main",
want: "https://raw.githubusercontent.com/bigmoves/quickslice/refs/heads/main/README.md",
},
{
name: "github master",
sourceURL: "https://github.com/user/repo",
branch: "master",
want: "https://raw.githubusercontent.com/user/repo/refs/heads/master/README.md",
},
// GitLab
{
name: "gitlab main",
sourceURL: "https://gitlab.com/user/repo",
branch: "main",
want: "https://gitlab.com/user/repo/-/raw/main/README.md",
},
{
name: "gitlab nested groups",
sourceURL: "https://gitlab.com/group/subgroup/repo",
branch: "main",
want: "https://gitlab.com/group/subgroup/repo/-/raw/main/README.md",
},
// Tangled
{
name: "tangled main",
sourceURL: "https://tangled.org/evan.jarrett.net/at-container-registry",
branch: "main",
want: "https://tangled.org/evan.jarrett.net/at-container-registry/raw/main/README.md",
},
{
name: "tangled legacy @ prefix",
sourceURL: "https://tangled.org/@user/repo",
branch: "main",
want: "https://tangled.org/user/repo/raw/main/README.md",
},
// Unsupported
{
name: "unsupported platform",
sourceURL: "https://bitbucket.org/user/repo",
branch: "main",
want: "",
},
{
name: "empty url",
sourceURL: "",
branch: "main",
want: "",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := DeriveReadmeURL(tt.sourceURL, tt.branch)
if got != tt.want {
t.Errorf("DeriveReadmeURL(%q, %q) = %q, want %q", tt.sourceURL, tt.branch, got, tt.want)
}
})
}
}