fix issue where soft 404 pages were being rendered in readme content. always update content on push

This commit is contained in:
Evan Jarrett
2026-01-03 23:11:48 -06:00
parent 6dd612e157
commit f74bc3018a
3 changed files with 372 additions and 84 deletions

View File

@@ -70,22 +70,7 @@ func NewFetcher() *Fetcher {
// FetchAndRender fetches a README from a URL and renders it as HTML
// Returns the rendered HTML and any error
func (f *Fetcher) FetchAndRender(ctx context.Context, readmeURL string) (string, error) {
// Validate URL
if readmeURL == "" {
return "", fmt.Errorf("empty README URL")
}
parsedURL, err := url.Parse(readmeURL)
if err != nil {
return "", fmt.Errorf("invalid README URL: %w", err)
}
// Only allow HTTP/HTTPS
if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
return "", fmt.Errorf("invalid URL scheme: %s", parsedURL.Scheme)
}
// Fetch content
// Fetch content (includes URL validation, Content-Type check, and HTML detection)
content, baseURL, err := f.fetchContent(ctx, readmeURL)
if err != nil {
return "", err
@@ -100,8 +85,36 @@ func (f *Fetcher) FetchAndRender(ctx context.Context, readmeURL string) (string,
return html, nil
}
// FetchRaw fetches raw README content from a URL without rendering
// Returns raw bytes with Content-Type and HTML validation
// Use this when you need to store the raw markdown (e.g., in PDS records)
func (f *Fetcher) FetchRaw(ctx context.Context, readmeURL string) ([]byte, error) {
// Fetch content (includes URL validation, Content-Type check, and HTML detection)
content, _, err := f.fetchContent(ctx, readmeURL)
if err != nil {
return nil, err
}
return content, nil
}
// fetchContent fetches the raw content from a URL
func (f *Fetcher) fetchContent(ctx context.Context, urlStr string) ([]byte, string, error) {
// Validate URL
if urlStr == "" {
return nil, "", fmt.Errorf("empty README URL")
}
parsedURL, err := url.Parse(urlStr)
if err != nil {
return nil, "", fmt.Errorf("invalid README URL: %w", err)
}
// Only allow HTTP/HTTPS
if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
return nil, "", fmt.Errorf("invalid URL scheme: %s", parsedURL.Scheme)
}
req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
if err != nil {
return nil, "", fmt.Errorf("failed to create request: %w", err)
@@ -120,6 +133,15 @@ func (f *Fetcher) fetchContent(ctx context.Context, urlStr string) ([]byte, stri
return nil, "", fmt.Errorf("unexpected status code: %d", resp.StatusCode)
}
// Reject HTML content types (catches proper error pages)
contentType := resp.Header.Get("Content-Type")
if contentType != "" {
ct := strings.ToLower(contentType)
if strings.Contains(ct, "text/html") || strings.Contains(ct, "application/xhtml") {
return nil, "", fmt.Errorf("unsupported content type: %s (expected markdown or plain text)", contentType)
}
}
// Limit content size to 1MB
limitedReader := io.LimitReader(resp.Body, 1*1024*1024)
content, err := io.ReadAll(limitedReader)
@@ -127,12 +149,35 @@ func (f *Fetcher) fetchContent(ctx context.Context, urlStr string) ([]byte, stri
return nil, "", fmt.Errorf("failed to read response body: %w", err)
}
// Detect HTML content by checking for common markers (catches soft 404s)
if LooksLikeHTML(content) {
return nil, "", fmt.Errorf("detected HTML content instead of markdown")
}
// Get base URL for relative link resolution
baseURL := getBaseURL(resp.Request.URL)
return content, baseURL, nil
}
// LooksLikeHTML checks if content appears to be HTML rather than markdown
// Exported for use by other packages that fetch README content
func LooksLikeHTML(content []byte) bool {
if len(content) == 0 {
return false
}
// Check first 512 bytes for HTML markers
checkLen := min(len(content), 512)
trimmed := strings.TrimSpace(string(content[:checkLen]))
lower := strings.ToLower(trimmed)
return strings.HasPrefix(lower, "<!doctype") ||
strings.HasPrefix(lower, "<html") ||
strings.HasPrefix(lower, "<?xml")
}
// renderMarkdown renders markdown content to sanitized HTML
func (f *Fetcher) renderMarkdown(content []byte, baseURL string) (string, error) {
var buf bytes.Buffer

View File

@@ -1,7 +1,11 @@
package readme
import (
"context"
"net/http"
"net/http/httptest"
"net/url"
"strings"
"testing"
)
@@ -305,4 +309,270 @@ func containsSubstringHelper(s, substr string) bool {
return false
}
// TODO: Add README fetching and caching tests
func TestLooksLikeHTML(t *testing.T) {
tests := []struct {
name string
content string
expected bool
}{
{
name: "empty content",
content: "",
expected: false,
},
{
name: "markdown content",
content: "# Hello World\n\nThis is a README.",
expected: false,
},
{
name: "plain text",
content: "Just some plain text without any HTML.",
expected: false,
},
{
name: "doctype html",
content: "<!DOCTYPE html>\n<html><body>Page</body></html>",
expected: true,
},
{
name: "doctype html lowercase",
content: "<!doctype html>\n<html><body>Page</body></html>",
expected: true,
},
{
name: "html tag only",
content: "<html><head></head><body>Page</body></html>",
expected: true,
},
{
name: "html tag with whitespace",
content: " \n <html>\n<body>Page</body></html>",
expected: true,
},
{
name: "xml declaration",
content: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html>...</html>",
expected: true,
},
{
name: "soft 404 page",
content: "<!DOCTYPE html><html><head><title>Page Not Found</title></head><body><h1>404</h1></body></html>",
expected: true,
},
{
name: "markdown with inline html",
content: "# Title\n\nSome text with <strong>bold</strong> inline.",
expected: false,
},
{
name: "markdown starting with hash",
content: "## Section\n\nContent here.",
expected: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := LooksLikeHTML([]byte(tt.content))
if result != tt.expected {
t.Errorf("looksLikeHTML(%q) = %v, want %v", tt.content, result, tt.expected)
}
})
}
}
func TestFetcher_FetchRaw(t *testing.T) {
fetcher := NewFetcher()
tests := []struct {
name string
handler http.HandlerFunc
wantErr bool
errContains string
wantContent string
}{
{
name: "successful markdown fetch",
handler: func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/plain")
w.Write([]byte("# Hello World\n\nThis is markdown."))
},
wantErr: false,
wantContent: "# Hello World",
},
{
name: "rejects HTML content type",
handler: func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html; charset=utf-8")
w.Write([]byte("<html><body>Error</body></html>"))
},
wantErr: true,
errContains: "unsupported content type",
},
{
name: "rejects soft 404 HTML content",
handler: func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/plain")
w.Write([]byte("<!DOCTYPE html><html><body>404 Not Found</body></html>"))
},
wantErr: true,
errContains: "detected HTML content",
},
{
name: "rejects 404 status",
handler: func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusNotFound)
w.Write([]byte("Not Found"))
},
wantErr: true,
errContains: "unexpected status code: 404",
},
{
name: "rejects 500 status",
handler: func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
w.Write([]byte("Internal Server Error"))
},
wantErr: true,
errContains: "unexpected status code: 500",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
server := httptest.NewServer(tt.handler)
defer server.Close()
content, err := fetcher.FetchRaw(context.Background(), server.URL)
if tt.wantErr {
if err == nil {
t.Errorf("FetchRaw() expected error containing %q, got nil", tt.errContains)
return
}
if !strings.Contains(err.Error(), tt.errContains) {
t.Errorf("FetchRaw() error = %q, want error containing %q", err.Error(), tt.errContains)
}
return
}
if err != nil {
t.Errorf("FetchRaw() unexpected error: %v", err)
return
}
if !strings.Contains(string(content), tt.wantContent) {
t.Errorf("FetchRaw() content = %q, want content containing %q", string(content), tt.wantContent)
}
})
}
}
func TestFetcher_FetchRaw_URLValidation(t *testing.T) {
fetcher := NewFetcher()
tests := []struct {
name string
url string
errContains string
}{
{
name: "empty URL",
url: "",
errContains: "empty README URL",
},
{
name: "invalid URL scheme",
url: "ftp://example.com/README.md",
errContains: "invalid URL scheme",
},
{
name: "file URL scheme",
url: "file:///etc/passwd",
errContains: "invalid URL scheme",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
_, err := fetcher.FetchRaw(context.Background(), tt.url)
if err == nil {
t.Errorf("FetchRaw(%q) expected error, got nil", tt.url)
return
}
if !strings.Contains(err.Error(), tt.errContains) {
t.Errorf("FetchRaw(%q) error = %q, want error containing %q", tt.url, err.Error(), tt.errContains)
}
})
}
}
func TestFetcher_FetchAndRender(t *testing.T) {
fetcher := NewFetcher()
tests := []struct {
name string
handler http.HandlerFunc
wantErr bool
errContains string
wantContain string
}{
{
name: "renders markdown to HTML",
handler: func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/plain")
w.Write([]byte("# Hello World\n\nThis is **bold** text."))
},
wantErr: false,
wantContain: "<strong>bold</strong>",
},
{
name: "rejects HTML content type",
handler: func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
w.Write([]byte("<html><body>Error</body></html>"))
},
wantErr: true,
errContains: "unsupported content type",
},
{
name: "rejects soft 404",
handler: func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/plain")
w.Write([]byte("<!doctype html><html><body>Not Found</body></html>"))
},
wantErr: true,
errContains: "detected HTML content",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
server := httptest.NewServer(tt.handler)
defer server.Close()
html, err := fetcher.FetchAndRender(context.Background(), server.URL)
if tt.wantErr {
if err == nil {
t.Errorf("FetchAndRender() expected error containing %q, got nil", tt.errContains)
return
}
if !strings.Contains(err.Error(), tt.errContains) {
t.Errorf("FetchAndRender() error = %q, want error containing %q", err.Error(), tt.errContains)
}
return
}
if err != nil {
t.Errorf("FetchAndRender() unexpected error: %v", err)
return
}
if !strings.Contains(html, tt.wantContain) {
t.Errorf("FetchAndRender() = %q, want HTML containing %q", html, tt.wantContain)
}
})
}
}

View File

@@ -424,23 +424,30 @@ func (s *ManifestStore) notifyHoldAboutManifest(ctx context.Context, manifestRec
return nil
}
// ensureRepoPage creates or updates a repo page record in the user's PDS if needed
// ensureRepoPage creates or updates a repo page record in the user's PDS
// This syncs repository metadata from manifest annotations to the io.atcr.repo.page collection
// Only creates a new record if one doesn't exist (doesn't overwrite user's custom content)
// Always updates the description on push (since users can't edit it via appview yet)
// Preserves user's avatar if they've set one via the appview
func (s *ManifestStore) ensureRepoPage(ctx context.Context, manifestRecord *atproto.ManifestRecord) {
// Check if repo page already exists (don't overwrite user's custom content)
rkey := s.ctx.Repository
_, err := s.ctx.ATProtoClient.GetRecord(ctx, atproto.RepoPageCollection, rkey)
if err == nil {
// Record already exists - don't overwrite
slog.Debug("Repo page already exists, skipping creation", "did", s.ctx.DID, "repository", s.ctx.Repository)
return
}
// Only continue if it's a "not found" error - other errors mean we should skip
if !errors.Is(err, atproto.ErrRecordNotFound) {
// Check for existing record to preserve user's avatar
var existingAvatarRef *atproto.ATProtoBlobRef
var existingRecord *atproto.RepoPageRecord
record, err := s.ctx.ATProtoClient.GetRecord(ctx, atproto.RepoPageCollection, rkey)
if err == nil && record != nil {
// Unmarshal the Value to get the RepoPageRecord
var repoPage atproto.RepoPageRecord
if unmarshalErr := json.Unmarshal(record.Value, &repoPage); unmarshalErr == nil {
existingRecord = &repoPage
existingAvatarRef = repoPage.Avatar
slog.Debug("Found existing repo page, will update", "did", s.ctx.DID, "repository", s.ctx.Repository, "hasExistingAvatar", existingAvatarRef != nil)
} else {
slog.Warn("Failed to unmarshal existing repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", unmarshalErr)
}
} else if err != nil && !errors.Is(err, atproto.ErrRecordNotFound) {
// Unexpected error - log and continue (will create new record)
slog.Warn("Failed to check for existing repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", err)
return
}
// Get annotations (may be nil if image has no OCI labels)
@@ -458,29 +465,37 @@ func (s *ManifestStore) ensureRepoPage(ctx context.Context, manifestRecord *atpr
description = annotations["org.opencontainers.image.description"]
}
// Try to fetch and upload icon from io.atcr.icon annotation
var avatarRef *atproto.ATProtoBlobRef
// Determine avatar: prefer new icon from annotations, otherwise keep existing
avatarRef := existingAvatarRef
if iconURL := annotations["io.atcr.icon"]; iconURL != "" {
avatarRef = s.fetchAndUploadIcon(ctx, iconURL)
if newAvatar := s.fetchAndUploadIcon(ctx, iconURL); newAvatar != nil {
avatarRef = newAvatar
}
}
// Create new repo page record with description and optional avatar
// Create/update repo page record with description and avatar
repoPage := atproto.NewRepoPageRecord(s.ctx.Repository, description, avatarRef)
slog.Info("Creating repo page from manifest annotations", "did", s.ctx.DID, "repository", s.ctx.Repository, "descriptionLength", len(description), "hasAvatar", avatarRef != nil)
isUpdate := existingRecord != nil
action := "Creating"
if isUpdate {
action = "Updating"
}
slog.Info(action+" repo page from manifest annotations", "did", s.ctx.DID, "repository", s.ctx.Repository, "descriptionLength", len(description), "hasAvatar", avatarRef != nil)
_, err = s.ctx.ATProtoClient.PutRecord(ctx, atproto.RepoPageCollection, rkey, repoPage)
if err != nil {
slog.Warn("Failed to create repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", err)
slog.Warn("Failed to "+strings.ToLower(action)+" repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", err)
return
}
slog.Info("Repo page created successfully", "did", s.ctx.DID, "repository", s.ctx.Repository)
slog.Info("Repo page "+strings.ToLower(action)+"d successfully", "did", s.ctx.DID, "repository", s.ctx.Repository)
}
// fetchReadmeContent attempts to fetch README content from external sources
// Priority: io.atcr.readme annotation > derived from org.opencontainers.image.source
// Returns the raw markdown content, or empty string if not available
// Uses the shared readme.Fetcher which validates Content-Type and rejects HTML content
func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[string]string) string {
if s.ctx.ReadmeFetcher == nil {
return ""
@@ -492,12 +507,12 @@ func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[
// Priority 1: Direct README URL from io.atcr.readme annotation
if readmeURL := annotations["io.atcr.readme"]; readmeURL != "" {
content, err := s.fetchRawReadme(fetchCtx, readmeURL)
content, err := s.ctx.ReadmeFetcher.FetchRaw(fetchCtx, readmeURL)
if err != nil {
slog.Debug("Failed to fetch README from io.atcr.readme annotation", "url", readmeURL, "error", err)
} else if content != "" {
} else if len(content) > 0 {
slog.Info("Fetched README from io.atcr.readme annotation", "url", readmeURL, "length", len(content))
return content
return string(content)
}
}
@@ -510,7 +525,7 @@ func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[
continue
}
content, err := s.fetchRawReadme(fetchCtx, readmeURL)
content, err := s.ctx.ReadmeFetcher.FetchRaw(fetchCtx, readmeURL)
if err != nil {
// Only log non-404 errors (404 is expected when trying main vs master)
if !readme.Is404(err) {
@@ -519,9 +534,9 @@ func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[
continue
}
if content != "" {
if len(content) > 0 {
slog.Info("Fetched README from source URL", "sourceURL", sourceURL, "branch", branch, "length", len(content))
return content
return string(content)
}
}
}
@@ -529,48 +544,6 @@ func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[
return ""
}
// fetchRawReadme fetches raw markdown content from a URL
// Returns the raw markdown (not rendered HTML) for storage in the repo page record
func (s *ManifestStore) fetchRawReadme(ctx context.Context, readmeURL string) (string, error) {
// Use a simple HTTP client to fetch raw content
// We want raw markdown, not rendered HTML (the Fetcher renders to HTML)
req, err := http.NewRequestWithContext(ctx, "GET", readmeURL, nil)
if err != nil {
return "", fmt.Errorf("failed to create request: %w", err)
}
req.Header.Set("User-Agent", "ATCR-README-Fetcher/1.0")
client := &http.Client{
Timeout: 10 * time.Second,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if len(via) >= 5 {
return fmt.Errorf("too many redirects")
}
return nil
},
}
resp, err := client.Do(req)
if err != nil {
return "", fmt.Errorf("failed to fetch URL: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return "", fmt.Errorf("unexpected status code: %d", resp.StatusCode)
}
// Limit content size to 100KB (repo page description has 100KB limit in lexicon)
limitedReader := io.LimitReader(resp.Body, 100*1024)
content, err := io.ReadAll(limitedReader)
if err != nil {
return "", fmt.Errorf("failed to read response body: %w", err)
}
return string(content), nil
}
// fetchAndUploadIcon fetches an image from a URL and uploads it as a blob to the user's PDS
// Returns the blob reference for use in the repo page record, or nil on error
func (s *ManifestStore) fetchAndUploadIcon(ctx context.Context, iconURL string) *atproto.ATProtoBlobRef {