diff --git a/pkg/appview/readme/fetcher.go b/pkg/appview/readme/fetcher.go
index 47d81f0..fa621d3 100644
--- a/pkg/appview/readme/fetcher.go
+++ b/pkg/appview/readme/fetcher.go
@@ -70,22 +70,7 @@ func NewFetcher() *Fetcher {
// FetchAndRender fetches a README from a URL and renders it as HTML
// Returns the rendered HTML and any error
func (f *Fetcher) FetchAndRender(ctx context.Context, readmeURL string) (string, error) {
- // Validate URL
- if readmeURL == "" {
- return "", fmt.Errorf("empty README URL")
- }
-
- parsedURL, err := url.Parse(readmeURL)
- if err != nil {
- return "", fmt.Errorf("invalid README URL: %w", err)
- }
-
- // Only allow HTTP/HTTPS
- if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
- return "", fmt.Errorf("invalid URL scheme: %s", parsedURL.Scheme)
- }
-
- // Fetch content
+ // Fetch content (includes URL validation, Content-Type check, and HTML detection)
content, baseURL, err := f.fetchContent(ctx, readmeURL)
if err != nil {
return "", err
@@ -100,8 +85,36 @@ func (f *Fetcher) FetchAndRender(ctx context.Context, readmeURL string) (string,
return html, nil
}
+// FetchRaw fetches raw README content from a URL without rendering
+// Returns raw bytes with Content-Type and HTML validation
+// Use this when you need to store the raw markdown (e.g., in PDS records)
+func (f *Fetcher) FetchRaw(ctx context.Context, readmeURL string) ([]byte, error) {
+ // Fetch content (includes URL validation, Content-Type check, and HTML detection)
+ content, _, err := f.fetchContent(ctx, readmeURL)
+ if err != nil {
+ return nil, err
+ }
+
+ return content, nil
+}
+
// fetchContent fetches the raw content from a URL
func (f *Fetcher) fetchContent(ctx context.Context, urlStr string) ([]byte, string, error) {
+ // Validate URL
+ if urlStr == "" {
+ return nil, "", fmt.Errorf("empty README URL")
+ }
+
+ parsedURL, err := url.Parse(urlStr)
+ if err != nil {
+ return nil, "", fmt.Errorf("invalid README URL: %w", err)
+ }
+
+ // Only allow HTTP/HTTPS
+ if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
+ return nil, "", fmt.Errorf("invalid URL scheme: %s", parsedURL.Scheme)
+ }
+
req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
if err != nil {
return nil, "", fmt.Errorf("failed to create request: %w", err)
@@ -120,6 +133,15 @@ func (f *Fetcher) fetchContent(ctx context.Context, urlStr string) ([]byte, stri
return nil, "", fmt.Errorf("unexpected status code: %d", resp.StatusCode)
}
+ // Reject HTML content types (catches proper error pages)
+ contentType := resp.Header.Get("Content-Type")
+ if contentType != "" {
+ ct := strings.ToLower(contentType)
+ if strings.Contains(ct, "text/html") || strings.Contains(ct, "application/xhtml") {
+ return nil, "", fmt.Errorf("unsupported content type: %s (expected markdown or plain text)", contentType)
+ }
+ }
+
// Limit content size to 1MB
limitedReader := io.LimitReader(resp.Body, 1*1024*1024)
content, err := io.ReadAll(limitedReader)
@@ -127,12 +149,35 @@ func (f *Fetcher) fetchContent(ctx context.Context, urlStr string) ([]byte, stri
return nil, "", fmt.Errorf("failed to read response body: %w", err)
}
+ // Detect HTML content by checking for common markers (catches soft 404s)
+ if LooksLikeHTML(content) {
+ return nil, "", fmt.Errorf("detected HTML content instead of markdown")
+ }
+
// Get base URL for relative link resolution
baseURL := getBaseURL(resp.Request.URL)
return content, baseURL, nil
}
+// LooksLikeHTML checks if content appears to be HTML rather than markdown
+// Exported for use by other packages that fetch README content
+func LooksLikeHTML(content []byte) bool {
+ if len(content) == 0 {
+ return false
+ }
+
+ // Check first 512 bytes for HTML markers
+ checkLen := min(len(content), 512)
+
+ trimmed := strings.TrimSpace(string(content[:checkLen]))
+ lower := strings.ToLower(trimmed)
+
+ return strings.HasPrefix(lower, "\n
Page",
+ expected: true,
+ },
+ {
+ name: "doctype html lowercase",
+ content: "\nPage",
+ expected: true,
+ },
+ {
+ name: "html tag only",
+ content: "Page",
+ expected: true,
+ },
+ {
+ name: "html tag with whitespace",
+ content: " \n \nPage",
+ expected: true,
+ },
+ {
+ name: "xml declaration",
+ content: "\n...",
+ expected: true,
+ },
+ {
+ name: "soft 404 page",
+ content: "Page Not Found404
",
+ expected: true,
+ },
+ {
+ name: "markdown with inline html",
+ content: "# Title\n\nSome text with bold inline.",
+ expected: false,
+ },
+ {
+ name: "markdown starting with hash",
+ content: "## Section\n\nContent here.",
+ expected: false,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ result := LooksLikeHTML([]byte(tt.content))
+ if result != tt.expected {
+ t.Errorf("looksLikeHTML(%q) = %v, want %v", tt.content, result, tt.expected)
+ }
+ })
+ }
+}
+
+func TestFetcher_FetchRaw(t *testing.T) {
+ fetcher := NewFetcher()
+
+ tests := []struct {
+ name string
+ handler http.HandlerFunc
+ wantErr bool
+ errContains string
+ wantContent string
+ }{
+ {
+ name: "successful markdown fetch",
+ handler: func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "text/plain")
+ w.Write([]byte("# Hello World\n\nThis is markdown."))
+ },
+ wantErr: false,
+ wantContent: "# Hello World",
+ },
+ {
+ name: "rejects HTML content type",
+ handler: func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ w.Write([]byte("Error"))
+ },
+ wantErr: true,
+ errContains: "unsupported content type",
+ },
+ {
+ name: "rejects soft 404 HTML content",
+ handler: func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "text/plain")
+ w.Write([]byte("404 Not Found"))
+ },
+ wantErr: true,
+ errContains: "detected HTML content",
+ },
+ {
+ name: "rejects 404 status",
+ handler: func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(http.StatusNotFound)
+ w.Write([]byte("Not Found"))
+ },
+ wantErr: true,
+ errContains: "unexpected status code: 404",
+ },
+ {
+ name: "rejects 500 status",
+ handler: func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(http.StatusInternalServerError)
+ w.Write([]byte("Internal Server Error"))
+ },
+ wantErr: true,
+ errContains: "unexpected status code: 500",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ server := httptest.NewServer(tt.handler)
+ defer server.Close()
+
+ content, err := fetcher.FetchRaw(context.Background(), server.URL)
+
+ if tt.wantErr {
+ if err == nil {
+ t.Errorf("FetchRaw() expected error containing %q, got nil", tt.errContains)
+ return
+ }
+ if !strings.Contains(err.Error(), tt.errContains) {
+ t.Errorf("FetchRaw() error = %q, want error containing %q", err.Error(), tt.errContains)
+ }
+ return
+ }
+
+ if err != nil {
+ t.Errorf("FetchRaw() unexpected error: %v", err)
+ return
+ }
+
+ if !strings.Contains(string(content), tt.wantContent) {
+ t.Errorf("FetchRaw() content = %q, want content containing %q", string(content), tt.wantContent)
+ }
+ })
+ }
+}
+
+func TestFetcher_FetchRaw_URLValidation(t *testing.T) {
+ fetcher := NewFetcher()
+
+ tests := []struct {
+ name string
+ url string
+ errContains string
+ }{
+ {
+ name: "empty URL",
+ url: "",
+ errContains: "empty README URL",
+ },
+ {
+ name: "invalid URL scheme",
+ url: "ftp://example.com/README.md",
+ errContains: "invalid URL scheme",
+ },
+ {
+ name: "file URL scheme",
+ url: "file:///etc/passwd",
+ errContains: "invalid URL scheme",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ _, err := fetcher.FetchRaw(context.Background(), tt.url)
+ if err == nil {
+ t.Errorf("FetchRaw(%q) expected error, got nil", tt.url)
+ return
+ }
+ if !strings.Contains(err.Error(), tt.errContains) {
+ t.Errorf("FetchRaw(%q) error = %q, want error containing %q", tt.url, err.Error(), tt.errContains)
+ }
+ })
+ }
+}
+
+func TestFetcher_FetchAndRender(t *testing.T) {
+ fetcher := NewFetcher()
+
+ tests := []struct {
+ name string
+ handler http.HandlerFunc
+ wantErr bool
+ errContains string
+ wantContain string
+ }{
+ {
+ name: "renders markdown to HTML",
+ handler: func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "text/plain")
+ w.Write([]byte("# Hello World\n\nThis is **bold** text."))
+ },
+ wantErr: false,
+ wantContain: "bold",
+ },
+ {
+ name: "rejects HTML content type",
+ handler: func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "text/html")
+ w.Write([]byte("Error"))
+ },
+ wantErr: true,
+ errContains: "unsupported content type",
+ },
+ {
+ name: "rejects soft 404",
+ handler: func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "text/plain")
+ w.Write([]byte("Not Found"))
+ },
+ wantErr: true,
+ errContains: "detected HTML content",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ server := httptest.NewServer(tt.handler)
+ defer server.Close()
+
+ html, err := fetcher.FetchAndRender(context.Background(), server.URL)
+
+ if tt.wantErr {
+ if err == nil {
+ t.Errorf("FetchAndRender() expected error containing %q, got nil", tt.errContains)
+ return
+ }
+ if !strings.Contains(err.Error(), tt.errContains) {
+ t.Errorf("FetchAndRender() error = %q, want error containing %q", err.Error(), tt.errContains)
+ }
+ return
+ }
+
+ if err != nil {
+ t.Errorf("FetchAndRender() unexpected error: %v", err)
+ return
+ }
+
+ if !strings.Contains(html, tt.wantContain) {
+ t.Errorf("FetchAndRender() = %q, want HTML containing %q", html, tt.wantContain)
+ }
+ })
+ }
+}
diff --git a/pkg/appview/storage/manifest_store.go b/pkg/appview/storage/manifest_store.go
index b6abf4c..7b4bc33 100644
--- a/pkg/appview/storage/manifest_store.go
+++ b/pkg/appview/storage/manifest_store.go
@@ -424,23 +424,30 @@ func (s *ManifestStore) notifyHoldAboutManifest(ctx context.Context, manifestRec
return nil
}
-// ensureRepoPage creates or updates a repo page record in the user's PDS if needed
+// ensureRepoPage creates or updates a repo page record in the user's PDS
// This syncs repository metadata from manifest annotations to the io.atcr.repo.page collection
-// Only creates a new record if one doesn't exist (doesn't overwrite user's custom content)
+// Always updates the description on push (since users can't edit it via appview yet)
+// Preserves user's avatar if they've set one via the appview
func (s *ManifestStore) ensureRepoPage(ctx context.Context, manifestRecord *atproto.ManifestRecord) {
- // Check if repo page already exists (don't overwrite user's custom content)
rkey := s.ctx.Repository
- _, err := s.ctx.ATProtoClient.GetRecord(ctx, atproto.RepoPageCollection, rkey)
- if err == nil {
- // Record already exists - don't overwrite
- slog.Debug("Repo page already exists, skipping creation", "did", s.ctx.DID, "repository", s.ctx.Repository)
- return
- }
- // Only continue if it's a "not found" error - other errors mean we should skip
- if !errors.Is(err, atproto.ErrRecordNotFound) {
+ // Check for existing record to preserve user's avatar
+ var existingAvatarRef *atproto.ATProtoBlobRef
+ var existingRecord *atproto.RepoPageRecord
+ record, err := s.ctx.ATProtoClient.GetRecord(ctx, atproto.RepoPageCollection, rkey)
+ if err == nil && record != nil {
+ // Unmarshal the Value to get the RepoPageRecord
+ var repoPage atproto.RepoPageRecord
+ if unmarshalErr := json.Unmarshal(record.Value, &repoPage); unmarshalErr == nil {
+ existingRecord = &repoPage
+ existingAvatarRef = repoPage.Avatar
+ slog.Debug("Found existing repo page, will update", "did", s.ctx.DID, "repository", s.ctx.Repository, "hasExistingAvatar", existingAvatarRef != nil)
+ } else {
+ slog.Warn("Failed to unmarshal existing repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", unmarshalErr)
+ }
+ } else if err != nil && !errors.Is(err, atproto.ErrRecordNotFound) {
+ // Unexpected error - log and continue (will create new record)
slog.Warn("Failed to check for existing repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", err)
- return
}
// Get annotations (may be nil if image has no OCI labels)
@@ -458,29 +465,37 @@ func (s *ManifestStore) ensureRepoPage(ctx context.Context, manifestRecord *atpr
description = annotations["org.opencontainers.image.description"]
}
- // Try to fetch and upload icon from io.atcr.icon annotation
- var avatarRef *atproto.ATProtoBlobRef
+ // Determine avatar: prefer new icon from annotations, otherwise keep existing
+ avatarRef := existingAvatarRef
if iconURL := annotations["io.atcr.icon"]; iconURL != "" {
- avatarRef = s.fetchAndUploadIcon(ctx, iconURL)
+ if newAvatar := s.fetchAndUploadIcon(ctx, iconURL); newAvatar != nil {
+ avatarRef = newAvatar
+ }
}
- // Create new repo page record with description and optional avatar
+ // Create/update repo page record with description and avatar
repoPage := atproto.NewRepoPageRecord(s.ctx.Repository, description, avatarRef)
- slog.Info("Creating repo page from manifest annotations", "did", s.ctx.DID, "repository", s.ctx.Repository, "descriptionLength", len(description), "hasAvatar", avatarRef != nil)
+ isUpdate := existingRecord != nil
+ action := "Creating"
+ if isUpdate {
+ action = "Updating"
+ }
+ slog.Info(action+" repo page from manifest annotations", "did", s.ctx.DID, "repository", s.ctx.Repository, "descriptionLength", len(description), "hasAvatar", avatarRef != nil)
_, err = s.ctx.ATProtoClient.PutRecord(ctx, atproto.RepoPageCollection, rkey, repoPage)
if err != nil {
- slog.Warn("Failed to create repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", err)
+ slog.Warn("Failed to "+strings.ToLower(action)+" repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", err)
return
}
- slog.Info("Repo page created successfully", "did", s.ctx.DID, "repository", s.ctx.Repository)
+ slog.Info("Repo page "+strings.ToLower(action)+"d successfully", "did", s.ctx.DID, "repository", s.ctx.Repository)
}
// fetchReadmeContent attempts to fetch README content from external sources
// Priority: io.atcr.readme annotation > derived from org.opencontainers.image.source
// Returns the raw markdown content, or empty string if not available
+// Uses the shared readme.Fetcher which validates Content-Type and rejects HTML content
func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[string]string) string {
if s.ctx.ReadmeFetcher == nil {
return ""
@@ -492,12 +507,12 @@ func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[
// Priority 1: Direct README URL from io.atcr.readme annotation
if readmeURL := annotations["io.atcr.readme"]; readmeURL != "" {
- content, err := s.fetchRawReadme(fetchCtx, readmeURL)
+ content, err := s.ctx.ReadmeFetcher.FetchRaw(fetchCtx, readmeURL)
if err != nil {
slog.Debug("Failed to fetch README from io.atcr.readme annotation", "url", readmeURL, "error", err)
- } else if content != "" {
+ } else if len(content) > 0 {
slog.Info("Fetched README from io.atcr.readme annotation", "url", readmeURL, "length", len(content))
- return content
+ return string(content)
}
}
@@ -510,7 +525,7 @@ func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[
continue
}
- content, err := s.fetchRawReadme(fetchCtx, readmeURL)
+ content, err := s.ctx.ReadmeFetcher.FetchRaw(fetchCtx, readmeURL)
if err != nil {
// Only log non-404 errors (404 is expected when trying main vs master)
if !readme.Is404(err) {
@@ -519,9 +534,9 @@ func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[
continue
}
- if content != "" {
+ if len(content) > 0 {
slog.Info("Fetched README from source URL", "sourceURL", sourceURL, "branch", branch, "length", len(content))
- return content
+ return string(content)
}
}
}
@@ -529,48 +544,6 @@ func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[
return ""
}
-// fetchRawReadme fetches raw markdown content from a URL
-// Returns the raw markdown (not rendered HTML) for storage in the repo page record
-func (s *ManifestStore) fetchRawReadme(ctx context.Context, readmeURL string) (string, error) {
- // Use a simple HTTP client to fetch raw content
- // We want raw markdown, not rendered HTML (the Fetcher renders to HTML)
- req, err := http.NewRequestWithContext(ctx, "GET", readmeURL, nil)
- if err != nil {
- return "", fmt.Errorf("failed to create request: %w", err)
- }
-
- req.Header.Set("User-Agent", "ATCR-README-Fetcher/1.0")
-
- client := &http.Client{
- Timeout: 10 * time.Second,
- CheckRedirect: func(req *http.Request, via []*http.Request) error {
- if len(via) >= 5 {
- return fmt.Errorf("too many redirects")
- }
- return nil
- },
- }
-
- resp, err := client.Do(req)
- if err != nil {
- return "", fmt.Errorf("failed to fetch URL: %w", err)
- }
- defer resp.Body.Close()
-
- if resp.StatusCode != http.StatusOK {
- return "", fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- }
-
- // Limit content size to 100KB (repo page description has 100KB limit in lexicon)
- limitedReader := io.LimitReader(resp.Body, 100*1024)
- content, err := io.ReadAll(limitedReader)
- if err != nil {
- return "", fmt.Errorf("failed to read response body: %w", err)
- }
-
- return string(content), nil
-}
-
// fetchAndUploadIcon fetches an image from a URL and uploads it as a blob to the user's PDS
// Returns the blob reference for use in the repo page record, or nil on error
func (s *ManifestStore) fetchAndUploadIcon(ctx context.Context, iconURL string) *atproto.ATProtoBlobRef {