diff --git a/pkg/appview/readme/fetcher.go b/pkg/appview/readme/fetcher.go index 47d81f0..fa621d3 100644 --- a/pkg/appview/readme/fetcher.go +++ b/pkg/appview/readme/fetcher.go @@ -70,22 +70,7 @@ func NewFetcher() *Fetcher { // FetchAndRender fetches a README from a URL and renders it as HTML // Returns the rendered HTML and any error func (f *Fetcher) FetchAndRender(ctx context.Context, readmeURL string) (string, error) { - // Validate URL - if readmeURL == "" { - return "", fmt.Errorf("empty README URL") - } - - parsedURL, err := url.Parse(readmeURL) - if err != nil { - return "", fmt.Errorf("invalid README URL: %w", err) - } - - // Only allow HTTP/HTTPS - if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" { - return "", fmt.Errorf("invalid URL scheme: %s", parsedURL.Scheme) - } - - // Fetch content + // Fetch content (includes URL validation, Content-Type check, and HTML detection) content, baseURL, err := f.fetchContent(ctx, readmeURL) if err != nil { return "", err @@ -100,8 +85,36 @@ func (f *Fetcher) FetchAndRender(ctx context.Context, readmeURL string) (string, return html, nil } +// FetchRaw fetches raw README content from a URL without rendering +// Returns raw bytes with Content-Type and HTML validation +// Use this when you need to store the raw markdown (e.g., in PDS records) +func (f *Fetcher) FetchRaw(ctx context.Context, readmeURL string) ([]byte, error) { + // Fetch content (includes URL validation, Content-Type check, and HTML detection) + content, _, err := f.fetchContent(ctx, readmeURL) + if err != nil { + return nil, err + } + + return content, nil +} + // fetchContent fetches the raw content from a URL func (f *Fetcher) fetchContent(ctx context.Context, urlStr string) ([]byte, string, error) { + // Validate URL + if urlStr == "" { + return nil, "", fmt.Errorf("empty README URL") + } + + parsedURL, err := url.Parse(urlStr) + if err != nil { + return nil, "", fmt.Errorf("invalid README URL: %w", err) + } + + // Only allow HTTP/HTTPS + if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" { + return nil, "", fmt.Errorf("invalid URL scheme: %s", parsedURL.Scheme) + } + req, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil) if err != nil { return nil, "", fmt.Errorf("failed to create request: %w", err) @@ -120,6 +133,15 @@ func (f *Fetcher) fetchContent(ctx context.Context, urlStr string) ([]byte, stri return nil, "", fmt.Errorf("unexpected status code: %d", resp.StatusCode) } + // Reject HTML content types (catches proper error pages) + contentType := resp.Header.Get("Content-Type") + if contentType != "" { + ct := strings.ToLower(contentType) + if strings.Contains(ct, "text/html") || strings.Contains(ct, "application/xhtml") { + return nil, "", fmt.Errorf("unsupported content type: %s (expected markdown or plain text)", contentType) + } + } + // Limit content size to 1MB limitedReader := io.LimitReader(resp.Body, 1*1024*1024) content, err := io.ReadAll(limitedReader) @@ -127,12 +149,35 @@ func (f *Fetcher) fetchContent(ctx context.Context, urlStr string) ([]byte, stri return nil, "", fmt.Errorf("failed to read response body: %w", err) } + // Detect HTML content by checking for common markers (catches soft 404s) + if LooksLikeHTML(content) { + return nil, "", fmt.Errorf("detected HTML content instead of markdown") + } + // Get base URL for relative link resolution baseURL := getBaseURL(resp.Request.URL) return content, baseURL, nil } +// LooksLikeHTML checks if content appears to be HTML rather than markdown +// Exported for use by other packages that fetch README content +func LooksLikeHTML(content []byte) bool { + if len(content) == 0 { + return false + } + + // Check first 512 bytes for HTML markers + checkLen := min(len(content), 512) + + trimmed := strings.TrimSpace(string(content[:checkLen])) + lower := strings.ToLower(trimmed) + + return strings.HasPrefix(lower, "\nPage", + expected: true, + }, + { + name: "doctype html lowercase", + content: "\nPage", + expected: true, + }, + { + name: "html tag only", + content: "Page", + expected: true, + }, + { + name: "html tag with whitespace", + content: " \n \nPage", + expected: true, + }, + { + name: "xml declaration", + content: "\n...", + expected: true, + }, + { + name: "soft 404 page", + content: "Page Not Found

404

", + expected: true, + }, + { + name: "markdown with inline html", + content: "# Title\n\nSome text with bold inline.", + expected: false, + }, + { + name: "markdown starting with hash", + content: "## Section\n\nContent here.", + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := LooksLikeHTML([]byte(tt.content)) + if result != tt.expected { + t.Errorf("looksLikeHTML(%q) = %v, want %v", tt.content, result, tt.expected) + } + }) + } +} + +func TestFetcher_FetchRaw(t *testing.T) { + fetcher := NewFetcher() + + tests := []struct { + name string + handler http.HandlerFunc + wantErr bool + errContains string + wantContent string + }{ + { + name: "successful markdown fetch", + handler: func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + w.Write([]byte("# Hello World\n\nThis is markdown.")) + }, + wantErr: false, + wantContent: "# Hello World", + }, + { + name: "rejects HTML content type", + handler: func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html; charset=utf-8") + w.Write([]byte("Error")) + }, + wantErr: true, + errContains: "unsupported content type", + }, + { + name: "rejects soft 404 HTML content", + handler: func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + w.Write([]byte("404 Not Found")) + }, + wantErr: true, + errContains: "detected HTML content", + }, + { + name: "rejects 404 status", + handler: func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNotFound) + w.Write([]byte("Not Found")) + }, + wantErr: true, + errContains: "unexpected status code: 404", + }, + { + name: "rejects 500 status", + handler: func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + w.Write([]byte("Internal Server Error")) + }, + wantErr: true, + errContains: "unexpected status code: 500", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + server := httptest.NewServer(tt.handler) + defer server.Close() + + content, err := fetcher.FetchRaw(context.Background(), server.URL) + + if tt.wantErr { + if err == nil { + t.Errorf("FetchRaw() expected error containing %q, got nil", tt.errContains) + return + } + if !strings.Contains(err.Error(), tt.errContains) { + t.Errorf("FetchRaw() error = %q, want error containing %q", err.Error(), tt.errContains) + } + return + } + + if err != nil { + t.Errorf("FetchRaw() unexpected error: %v", err) + return + } + + if !strings.Contains(string(content), tt.wantContent) { + t.Errorf("FetchRaw() content = %q, want content containing %q", string(content), tt.wantContent) + } + }) + } +} + +func TestFetcher_FetchRaw_URLValidation(t *testing.T) { + fetcher := NewFetcher() + + tests := []struct { + name string + url string + errContains string + }{ + { + name: "empty URL", + url: "", + errContains: "empty README URL", + }, + { + name: "invalid URL scheme", + url: "ftp://example.com/README.md", + errContains: "invalid URL scheme", + }, + { + name: "file URL scheme", + url: "file:///etc/passwd", + errContains: "invalid URL scheme", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := fetcher.FetchRaw(context.Background(), tt.url) + if err == nil { + t.Errorf("FetchRaw(%q) expected error, got nil", tt.url) + return + } + if !strings.Contains(err.Error(), tt.errContains) { + t.Errorf("FetchRaw(%q) error = %q, want error containing %q", tt.url, err.Error(), tt.errContains) + } + }) + } +} + +func TestFetcher_FetchAndRender(t *testing.T) { + fetcher := NewFetcher() + + tests := []struct { + name string + handler http.HandlerFunc + wantErr bool + errContains string + wantContain string + }{ + { + name: "renders markdown to HTML", + handler: func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + w.Write([]byte("# Hello World\n\nThis is **bold** text.")) + }, + wantErr: false, + wantContain: "bold", + }, + { + name: "rejects HTML content type", + handler: func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + w.Write([]byte("Error")) + }, + wantErr: true, + errContains: "unsupported content type", + }, + { + name: "rejects soft 404", + handler: func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain") + w.Write([]byte("Not Found")) + }, + wantErr: true, + errContains: "detected HTML content", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + server := httptest.NewServer(tt.handler) + defer server.Close() + + html, err := fetcher.FetchAndRender(context.Background(), server.URL) + + if tt.wantErr { + if err == nil { + t.Errorf("FetchAndRender() expected error containing %q, got nil", tt.errContains) + return + } + if !strings.Contains(err.Error(), tt.errContains) { + t.Errorf("FetchAndRender() error = %q, want error containing %q", err.Error(), tt.errContains) + } + return + } + + if err != nil { + t.Errorf("FetchAndRender() unexpected error: %v", err) + return + } + + if !strings.Contains(html, tt.wantContain) { + t.Errorf("FetchAndRender() = %q, want HTML containing %q", html, tt.wantContain) + } + }) + } +} diff --git a/pkg/appview/storage/manifest_store.go b/pkg/appview/storage/manifest_store.go index b6abf4c..7b4bc33 100644 --- a/pkg/appview/storage/manifest_store.go +++ b/pkg/appview/storage/manifest_store.go @@ -424,23 +424,30 @@ func (s *ManifestStore) notifyHoldAboutManifest(ctx context.Context, manifestRec return nil } -// ensureRepoPage creates or updates a repo page record in the user's PDS if needed +// ensureRepoPage creates or updates a repo page record in the user's PDS // This syncs repository metadata from manifest annotations to the io.atcr.repo.page collection -// Only creates a new record if one doesn't exist (doesn't overwrite user's custom content) +// Always updates the description on push (since users can't edit it via appview yet) +// Preserves user's avatar if they've set one via the appview func (s *ManifestStore) ensureRepoPage(ctx context.Context, manifestRecord *atproto.ManifestRecord) { - // Check if repo page already exists (don't overwrite user's custom content) rkey := s.ctx.Repository - _, err := s.ctx.ATProtoClient.GetRecord(ctx, atproto.RepoPageCollection, rkey) - if err == nil { - // Record already exists - don't overwrite - slog.Debug("Repo page already exists, skipping creation", "did", s.ctx.DID, "repository", s.ctx.Repository) - return - } - // Only continue if it's a "not found" error - other errors mean we should skip - if !errors.Is(err, atproto.ErrRecordNotFound) { + // Check for existing record to preserve user's avatar + var existingAvatarRef *atproto.ATProtoBlobRef + var existingRecord *atproto.RepoPageRecord + record, err := s.ctx.ATProtoClient.GetRecord(ctx, atproto.RepoPageCollection, rkey) + if err == nil && record != nil { + // Unmarshal the Value to get the RepoPageRecord + var repoPage atproto.RepoPageRecord + if unmarshalErr := json.Unmarshal(record.Value, &repoPage); unmarshalErr == nil { + existingRecord = &repoPage + existingAvatarRef = repoPage.Avatar + slog.Debug("Found existing repo page, will update", "did", s.ctx.DID, "repository", s.ctx.Repository, "hasExistingAvatar", existingAvatarRef != nil) + } else { + slog.Warn("Failed to unmarshal existing repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", unmarshalErr) + } + } else if err != nil && !errors.Is(err, atproto.ErrRecordNotFound) { + // Unexpected error - log and continue (will create new record) slog.Warn("Failed to check for existing repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", err) - return } // Get annotations (may be nil if image has no OCI labels) @@ -458,29 +465,37 @@ func (s *ManifestStore) ensureRepoPage(ctx context.Context, manifestRecord *atpr description = annotations["org.opencontainers.image.description"] } - // Try to fetch and upload icon from io.atcr.icon annotation - var avatarRef *atproto.ATProtoBlobRef + // Determine avatar: prefer new icon from annotations, otherwise keep existing + avatarRef := existingAvatarRef if iconURL := annotations["io.atcr.icon"]; iconURL != "" { - avatarRef = s.fetchAndUploadIcon(ctx, iconURL) + if newAvatar := s.fetchAndUploadIcon(ctx, iconURL); newAvatar != nil { + avatarRef = newAvatar + } } - // Create new repo page record with description and optional avatar + // Create/update repo page record with description and avatar repoPage := atproto.NewRepoPageRecord(s.ctx.Repository, description, avatarRef) - slog.Info("Creating repo page from manifest annotations", "did", s.ctx.DID, "repository", s.ctx.Repository, "descriptionLength", len(description), "hasAvatar", avatarRef != nil) + isUpdate := existingRecord != nil + action := "Creating" + if isUpdate { + action = "Updating" + } + slog.Info(action+" repo page from manifest annotations", "did", s.ctx.DID, "repository", s.ctx.Repository, "descriptionLength", len(description), "hasAvatar", avatarRef != nil) _, err = s.ctx.ATProtoClient.PutRecord(ctx, atproto.RepoPageCollection, rkey, repoPage) if err != nil { - slog.Warn("Failed to create repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", err) + slog.Warn("Failed to "+strings.ToLower(action)+" repo page", "did", s.ctx.DID, "repository", s.ctx.Repository, "error", err) return } - slog.Info("Repo page created successfully", "did", s.ctx.DID, "repository", s.ctx.Repository) + slog.Info("Repo page "+strings.ToLower(action)+"d successfully", "did", s.ctx.DID, "repository", s.ctx.Repository) } // fetchReadmeContent attempts to fetch README content from external sources // Priority: io.atcr.readme annotation > derived from org.opencontainers.image.source // Returns the raw markdown content, or empty string if not available +// Uses the shared readme.Fetcher which validates Content-Type and rejects HTML content func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[string]string) string { if s.ctx.ReadmeFetcher == nil { return "" @@ -492,12 +507,12 @@ func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[ // Priority 1: Direct README URL from io.atcr.readme annotation if readmeURL := annotations["io.atcr.readme"]; readmeURL != "" { - content, err := s.fetchRawReadme(fetchCtx, readmeURL) + content, err := s.ctx.ReadmeFetcher.FetchRaw(fetchCtx, readmeURL) if err != nil { slog.Debug("Failed to fetch README from io.atcr.readme annotation", "url", readmeURL, "error", err) - } else if content != "" { + } else if len(content) > 0 { slog.Info("Fetched README from io.atcr.readme annotation", "url", readmeURL, "length", len(content)) - return content + return string(content) } } @@ -510,7 +525,7 @@ func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[ continue } - content, err := s.fetchRawReadme(fetchCtx, readmeURL) + content, err := s.ctx.ReadmeFetcher.FetchRaw(fetchCtx, readmeURL) if err != nil { // Only log non-404 errors (404 is expected when trying main vs master) if !readme.Is404(err) { @@ -519,9 +534,9 @@ func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[ continue } - if content != "" { + if len(content) > 0 { slog.Info("Fetched README from source URL", "sourceURL", sourceURL, "branch", branch, "length", len(content)) - return content + return string(content) } } } @@ -529,48 +544,6 @@ func (s *ManifestStore) fetchReadmeContent(ctx context.Context, annotations map[ return "" } -// fetchRawReadme fetches raw markdown content from a URL -// Returns the raw markdown (not rendered HTML) for storage in the repo page record -func (s *ManifestStore) fetchRawReadme(ctx context.Context, readmeURL string) (string, error) { - // Use a simple HTTP client to fetch raw content - // We want raw markdown, not rendered HTML (the Fetcher renders to HTML) - req, err := http.NewRequestWithContext(ctx, "GET", readmeURL, nil) - if err != nil { - return "", fmt.Errorf("failed to create request: %w", err) - } - - req.Header.Set("User-Agent", "ATCR-README-Fetcher/1.0") - - client := &http.Client{ - Timeout: 10 * time.Second, - CheckRedirect: func(req *http.Request, via []*http.Request) error { - if len(via) >= 5 { - return fmt.Errorf("too many redirects") - } - return nil - }, - } - - resp, err := client.Do(req) - if err != nil { - return "", fmt.Errorf("failed to fetch URL: %w", err) - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - return "", fmt.Errorf("unexpected status code: %d", resp.StatusCode) - } - - // Limit content size to 100KB (repo page description has 100KB limit in lexicon) - limitedReader := io.LimitReader(resp.Body, 100*1024) - content, err := io.ReadAll(limitedReader) - if err != nil { - return "", fmt.Errorf("failed to read response body: %w", err) - } - - return string(content), nil -} - // fetchAndUploadIcon fetches an image from a URL and uploads it as a blob to the user's PDS // Returns the blob reference for use in the repo page record, or nil on error func (s *ManifestStore) fetchAndUploadIcon(ctx context.Context, iconURL string) *atproto.ATProtoBlobRef {