// Package jetstream backfill worker.
// Source: at-container-registry/pkg/appview/jetstream/backfill.go
package jetstream
import (
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"io"
"log/slog"
"net/http"
"strings"
"time"
"atcr.io/pkg/appview/db"
"atcr.io/pkg/appview/readme"
"atcr.io/pkg/atproto"
"atcr.io/pkg/auth/oauth"
)
// BackfillWorker uses com.atproto.sync.listReposByCollection to backfill historical data
// into the AppView database. It walks every ATCR collection, discovers the repos that
// hold records in that collection via relay endpoints, and replays each repo's records
// through the shared Processor.
type BackfillWorker struct {
	db        *sql.DB          // AppView database handle used for all cached reads/writes
	endpoints *EndpointRotator // Relay endpoints; rotated on failure by listReposByCollectionWithFailover
	processor *Processor       // Shared processor for DB operations
	defaultHoldDID string // Default hold DID from AppView config (e.g., "did:web:hold01.atcr.io")
	testMode       bool   // If true, suppress warnings for external holds
	refresher *oauth.Refresher // OAuth refresher for PDS writes (optional, can be nil)
}
// BackfillState tracks backfill progress.
// NOTE(review): not referenced anywhere in this file — presumably persisted or
// consumed by another file in the package; confirm before removing.
type BackfillState struct {
	Collection   string // Collection NSID being backfilled (e.g. io.atcr.manifest)
	RepoCursor   string // Cursor for listReposByCollection
	CurrentDID   string // Current DID being processed
	RecordCursor string // Cursor for listRecords within current DID
	ProcessedRepos   int  // Count of repos fully processed so far
	ProcessedRecords int  // Count of individual records processed so far
	Completed        bool // True once the collection has been fully backfilled
}
// NewBackfillWorker creates a backfill worker using sync API.
// defaultHoldDID should be in format "did:web:hold01.atcr.io".
// To find a hold's DID, visit: https://hold-url/.well-known/did.json
// refresher is optional - if provided, backfill will try to update PDS records
// when fetching README content.
func NewBackfillWorker(database *sql.DB, relayEndpoints []string, defaultHoldDID string, testMode bool, refresher *oauth.Refresher) (*BackfillWorker, error) {
	endpoints := relayEndpoints
	if len(endpoints) == 0 {
		// No relays configured: fall back to Bluesky's public relay.
		endpoints = []string{"https://relay1.us-east.bsky.network"}
	}
	worker := &BackfillWorker{
		db:             database,
		endpoints:      NewEndpointRotator(endpoints),
		processor:      NewProcessor(database, false, NewStatsCache()), // Stats cache for aggregation
		defaultHoldDID: defaultHoldDID,
		testMode:       testMode,
		refresher:      refresher,
	}
	return worker, nil
}
// Start runs the backfill for all ATCR collections, one at a time.
// It first caches the default hold's captain and crew records (required for
// localhost/private holds that relays cannot discover), then backfills each
// collection in a fixed order. A failure on any collection aborts the run.
func (b *BackfillWorker) Start(ctx context.Context) error {
	slog.Info("Backfill: Starting sync-based backfill...")

	// First, query and cache the default hold's captain and crew records.
	// This is necessary for localhost/private holds not discoverable via relay.
	// Failures here are warned about but never abort the backfill.
	if b.defaultHoldDID != "" {
		slog.Info("Backfill querying default hold records", "hold_did", b.defaultHoldDID)
		if err := b.queryCaptainRecord(ctx, b.defaultHoldDID); err != nil {
			slog.Warn("Backfill failed to query default hold captain record", "error", err)
		}
		if err := b.queryCrewRecords(ctx, b.defaultHoldDID); err != nil {
			slog.Warn("Backfill failed to query default hold crew records", "error", err)
		}
	}

	// Collections are backfilled in dependency-friendly order (manifests before
	// tags/stars, profiles, pages, then hold-emitted collections).
	for _, collection := range []string{
		atproto.ManifestCollection,      // io.atcr.manifest
		atproto.TagCollection,           // io.atcr.tag
		atproto.StarCollection,          // io.atcr.sailor.star
		atproto.SailorProfileCollection, // io.atcr.sailor.profile
		atproto.RepoPageCollection,      // io.atcr.repo.page
		atproto.StatsCollection,         // io.atcr.hold.stats (from holds)
		atproto.DailyStatsCollection,    // io.atcr.hold.stats.daily (from holds)
		atproto.CaptainCollection,       // io.atcr.hold.captain (from holds)
		atproto.CrewCollection,          // io.atcr.hold.crew (from holds)
		atproto.ScanCollection,          // io.atcr.hold.scan (from holds)
	} {
		slog.Info("Backfill processing collection", "collection", collection)
		if err := b.backfillCollection(ctx, collection); err != nil {
			return fmt.Errorf("failed to backfill collection %s: %w", collection, err)
		}
		slog.Info("Backfill completed collection", "collection", collection)
	}

	slog.Info("Backfill: All collections completed!")
	return nil
}
// listReposByCollectionWithFailover tries all relay endpoints to list repos for a collection.
// On failure it advances to the next endpoint. Returns error only if all endpoints fail.
func (b *BackfillWorker) listReposByCollectionWithFailover(ctx context.Context, collection string, limit int, cursor string) (*atproto.ListReposByCollectionResult, error) {
	var lastErr error
	// Make at most one attempt per configured relay endpoint.
	for attempt := 0; attempt < b.endpoints.Len(); attempt++ {
		current := b.endpoints.Current()
		result, err := atproto.NewClient(current, "", "").ListReposByCollection(ctx, collection, limit, cursor)
		if err == nil {
			return result, nil
		}
		lastErr = err
		// Rotate to the next relay and record the failure before retrying.
		next := b.endpoints.Next()
		slog.Warn("Backfill relay failed, trying next",
			"failed_endpoint", current,
			"next_endpoint", next,
			"error", err)
	}
	return nil, fmt.Errorf("all relay endpoints failed: %w", lastErr)
}
// backfillCollection backfills a single collection by paginating through
// listReposByCollection (with relay failover) and replaying every matching
// repo. Per-repo failures are logged and skipped so one bad repo cannot stall
// the whole collection; repos that look deleted/deactivated have their cached
// data purged from the DB. Returns an error only when repo listing itself fails.
func (b *BackfillWorker) backfillCollection(ctx context.Context, collection string) error {
	var repoCursor string
	processedRepos := 0
	processedRecords := 0
	// Paginate through all repos with this collection
	for {
		// List repos that have records in this collection (with relay failover)
		result, err := b.listReposByCollectionWithFailover(ctx, collection, 1000, repoCursor)
		if err != nil {
			return fmt.Errorf("failed to list repos: %w", err)
		}
		slog.Info("Backfill found repos with collection", "count", len(result.Repos), "collection", collection, "cursor", repoCursor)
		// Process each repo (DID)
		for _, repo := range result.Repos {
			recordCount, err := b.backfillRepo(ctx, repo.DID, collection)
			if err != nil {
				// Account may be deleted/deactivated/migrated - clean up our cached data.
				// Detection is by error-string matching because the client surfaces
				// these PDS conditions as formatted errors, not typed sentinels.
				errStr := err.Error()
				if strings.Contains(errStr, "RepoNotFound") ||
					strings.Contains(errStr, "Could not find repo") ||
					strings.Contains(errStr, "status 400") ||
					strings.Contains(errStr, "status 404") {
					deleted, delErr := db.DeleteUserData(b.db, repo.DID)
					if delErr != nil {
						slog.Warn("Backfill failed to delete data for removed repo", "did", repo.DID, "error", delErr)
					} else if deleted {
						slog.Info("Backfill cleaned up data for deleted/deactivated repo", "did", repo.DID)
					}
				} else {
					slog.Warn("Backfill failed to backfill repo", "did", repo.DID, "error", err)
				}
				continue
			}
			processedRepos++
			processedRecords += recordCount
			// Periodic progress logging every 10 successful repos.
			if processedRepos%10 == 0 {
				slog.Info("Backfill progress", "repos", processedRepos, "records", processedRecords)
			}
		}
		// Check if there are more pages; an empty cursor means we're done.
		if result.Cursor == "" {
			break
		}
		repoCursor = result.Cursor
	}
	slog.Info("Backfill collection complete", "collection", collection, "repos", processedRepos, "records", processedRecords)
	return nil
}
// backfillRepo backfills all records for a single repo/DID.
// Records are fetched from PDS first, then network-dependent caches are warmed,
// and finally DB writes happen in chunked transactions to batch writes while
// staying under the remote SQLite transaction timeout (~5s on Bunny Database).
//
// Returns the number of records successfully written. Listing failures abort
// the repo with an error; reconciliation failures are only logged.
func (b *BackfillWorker) backfillRepo(ctx context.Context, did, collection string) (int, error) {
	// Resolve DID to get user's PDS endpoint
	pdsEndpoint, err := atproto.ResolveDIDToPDS(ctx, did)
	if err != nil {
		return 0, fmt.Errorf("failed to resolve DID to PDS: %w", err)
	}
	// Create a client for this user's PDS with the user's DID
	pdsClient := atproto.NewClient(pdsEndpoint, did, "")
	var recordCursor string
	// Track which records exist on the PDS for reconciliation
	// (so reconcileDeletions can drop DB rows the PDS no longer has).
	var foundManifestDigests []string
	var foundTags []struct{ Repository, Tag string }
	foundStars := make(map[string]time.Time) // key: "ownerDID/repository", value: createdAt
	// Phase 1: Collect all records from PDS (network I/O, no transaction)
	var allRecords []atproto.Record
	for {
		records, cursor, err := pdsClient.ListRecordsForRepo(ctx, did, collection, 100, recordCursor)
		if err != nil {
			return 0, fmt.Errorf("failed to list records: %w", err)
		}
		for _, record := range records {
			// While collecting, remember each record's identifying key per type.
			// Unmarshal failures are deliberately ignored here: a malformed record
			// simply isn't tracked for reconciliation.
			switch collection {
			case atproto.ManifestCollection:
				var manifestRecord atproto.ManifestRecord
				if err := json.Unmarshal(record.Value, &manifestRecord); err == nil {
					foundManifestDigests = append(foundManifestDigests, manifestRecord.Digest)
				}
			case atproto.TagCollection:
				var tagRecord atproto.TagRecord
				if err := json.Unmarshal(record.Value, &tagRecord); err == nil {
					foundTags = append(foundTags, struct{ Repository, Tag string }{
						Repository: tagRecord.Repository,
						Tag:        tagRecord.Tag,
					})
				}
			case atproto.StarCollection:
				var starRecord atproto.StarRecord
				if err := json.Unmarshal(record.Value, &starRecord); err == nil {
					if ownerDID, repository, err := starRecord.GetSubjectDIDAndRepository(); err == nil {
						key := fmt.Sprintf("%s/%s", ownerDID, repository)
						foundStars[key] = starRecord.CreatedAt
					}
				}
			}
			allRecords = append(allRecords, record)
		}
		if cursor == "" {
			break
		}
		recordCursor = cursor
	}
	// Phase 2: Pre-warm caches outside any transaction so that ProcessRecord
	// inside transactions hits only DB (no network I/O that could cause timeouts).
	// Ensure user exists in DB (resolves DID → handle/PDS, fetches profile).
	// Pre-warm failures are non-fatal: processing continues and may re-resolve.
	switch collection {
	case atproto.SailorProfileCollection:
		if err := b.processor.EnsureUser(ctx, did); err != nil {
			slog.Warn("Backfill failed to pre-ensure user", "did", did, "error", err)
		}
	case atproto.ManifestCollection, atproto.TagCollection, atproto.StarCollection, atproto.RepoPageCollection:
		if err := b.processor.EnsureUserExists(ctx, did); err != nil {
			slog.Warn("Backfill failed to pre-ensure user", "did", did, "error", err)
		}
	}
	// Pre-cache hold DIDs and captain records referenced in records.
	// ProcessSailorProfile calls ResolveHoldDID + queryCaptainFn,
	// ProcessManifest calls ResolveHoldDID for legacy manifests.
	b.prewarmHoldCaches(ctx, collection, allRecords)
	// Phase 3: Write records to the DB.
	//
	// For collections whose writes are straightforward idempotent upserts, we
	// batch every record in the repo into one multi-row INSERT per table. This
	// replaces the previous 20-record chunked transaction loop, which exceeded
	// Bunny Database's remote transaction timeout (~5s) once chunks grew large
	// and poisoned the connection pool on timeout.
	//
	// Collections that do network I/O per record (SailorProfile) or have
	// conditional read-then-write logic (Scan) stay on the single-record path
	// where each write is its own statement and cannot hold a long transaction.
	var recordCount int
	var procErr error
	switch collection {
	case atproto.ManifestCollection:
		recordCount, procErr = b.batchManifests(ctx, did, allRecords)
	case atproto.TagCollection:
		recordCount, procErr = b.batchTags(did, allRecords)
	case atproto.StarCollection:
		recordCount, procErr = b.batchStars(ctx, did, allRecords)
	case atproto.RepoPageCollection:
		recordCount, procErr = b.batchRepoPages(did, allRecords)
	case atproto.DailyStatsCollection:
		recordCount, procErr = b.batchDailyStats(ctx, did, allRecords)
	case atproto.StatsCollection:
		recordCount, procErr = b.batchStats(ctx, did, allRecords)
	case atproto.CaptainCollection:
		recordCount, procErr = b.batchCaptains(did, allRecords)
	case atproto.CrewCollection:
		recordCount, procErr = b.batchCrew(did, allRecords)
	default:
		// SailorProfileCollection and ScanCollection keep per-record processing
		// because they do network I/O or conditional reads that would be awkward
		// to batch. Each call writes a single row, so there is no long-lived
		// transaction at risk.
		for i := range allRecords {
			if err := b.processRecordWith(ctx, b.processor, did, collection, &allRecords[i]); err != nil {
				slog.Warn("Backfill failed to process record", "uri", allRecords[i].URI, "error", err)
				continue
			}
			recordCount++
		}
	}
	if procErr != nil {
		return recordCount, procErr
	}
	// Reconciliation runs outside the transaction (involves network I/O and fewer writes).
	// Reconcile deletions - remove records from DB that no longer exist on PDS.
	if err := b.reconcileDeletions(did, collection, foundManifestDigests, foundTags, foundStars); err != nil {
		slog.Warn("Backfill failed to reconcile deletions", "did", did, "error", err)
	}
	// After processing manifests, clean up orphaned tags (tags pointing to non-existent manifests)
	if collection == atproto.ManifestCollection {
		if err := db.CleanupOrphanedTags(b.db, did); err != nil {
			slog.Warn("Backfill failed to cleanup orphaned tags", "did", did, "error", err)
		}
		// Reconcile annotations - ensure they come from newest manifest per repository.
		// This fixes out-of-order backfill where older manifests can overwrite newer annotations.
		if err := b.reconcileAnnotations(ctx, did, pdsClient); err != nil {
			slog.Warn("Backfill failed to reconcile annotations", "did", did, "error", err)
		}
		// Refresh user's avatar from their Bluesky profile.
		// This ensures cached avatars stay fresh even if the user changes their profile pic.
		if err := b.processor.RefreshUserAvatar(ctx, did, pdsEndpoint); err != nil {
			slog.Warn("Backfill failed to refresh avatar", "did", did, "error", err)
		}
	}
	// After processing repo pages, fetch descriptions from external sources if empty
	if collection == atproto.RepoPageCollection {
		if err := b.reconcileRepoPageDescriptions(ctx, did, pdsEndpoint); err != nil {
			slog.Warn("Backfill failed to reconcile repo page descriptions", "did", did, "error", err)
		}
	}
	return recordCount, nil
}
// reconcileDeletions removes records from the database that no longer exist on the PDS.
// The found* arguments hold what this backfill pass observed on the PDS; any DB row
// absent from them is considered orphaned and deleted.
func (b *BackfillWorker) reconcileDeletions(did, collection string, foundManifestDigests []string, foundTags []struct{ Repository, Tag string }, foundStars map[string]time.Time) error {
	switch collection {
	case atproto.ManifestCollection:
		// Snapshot current DB manifests so we can log roughly how many were removed.
		dbDigests, err := db.GetManifestDigestsForDID(b.db, did)
		if err != nil {
			return fmt.Errorf("failed to get DB manifests: %w", err)
		}
		if err := db.DeleteManifestsNotInList(b.db, did, foundManifestDigests); err != nil {
			return fmt.Errorf("failed to delete orphaned manifests: %w", err)
		}
		if removed := len(dbDigests) - len(foundManifestDigests); removed > 0 {
			slog.Info("Backfill deleted orphaned manifests", "count", removed, "did", did)
		}
	case atproto.TagCollection:
		dbTags, err := db.GetTagsForDID(b.db, did)
		if err != nil {
			return fmt.Errorf("failed to get DB tags: %w", err)
		}
		if err := db.DeleteTagsNotInList(b.db, did, foundTags); err != nil {
			return fmt.Errorf("failed to delete orphaned tags: %w", err)
		}
		if removed := len(dbTags) - len(foundTags); removed > 0 {
			slog.Info("Backfill deleted orphaned tags", "count", removed, "did", did)
		}
	case atproto.StarCollection:
		// Star counts are computed on demand from the stars table, so removing
		// the orphaned rows is the only reconciliation required.
		if err := db.DeleteStarsNotInList(b.db, did, foundStars); err != nil {
			return fmt.Errorf("failed to delete orphaned stars: %w", err)
		}
	}
	return nil
}
// prewarmHoldCaches resolves hold DIDs and caches captain records before
// records are processed inside transactions. This ensures ProcessRecord's
// network-dependent code paths (ResolveHoldDID, queryCaptainRecord) hit
// cached data so transactions stay fast and don't timeout.
// All failures are logged and skipped; this function never fails the backfill.
func (b *BackfillWorker) prewarmHoldCaches(ctx context.Context, collection string, records []atproto.Record) {
	// Dedupe hold references so each is resolved at most once per repo.
	seen := make(map[string]bool)
	for _, record := range records {
		var holdRef string
		switch collection {
		case atproto.SailorProfileCollection:
			var profileRecord atproto.SailorProfileRecord
			if err := json.Unmarshal(record.Value, &profileRecord); err == nil {
				holdRef = profileRecord.DefaultHold
			}
		case atproto.ManifestCollection:
			var manifestRecord atproto.ManifestRecord
			if err := json.Unmarshal(record.Value, &manifestRecord); err == nil {
				// Only legacy manifests need network resolution (URL → DID)
				if manifestRecord.HoldDID == "" && manifestRecord.HoldEndpoint != "" {
					holdRef = manifestRecord.HoldEndpoint
				}
			}
		default:
			// No hold references in other collections. Note this returns from the
			// whole function on the first record, which is correct because the
			// collection is the same for every record in the slice.
			return
		}
		if holdRef == "" || seen[holdRef] {
			continue
		}
		seen[holdRef] = true
		// Resolve hold identifier to DID (caches in resolver)
		holdDID, err := atproto.ResolveHoldDID(ctx, holdRef)
		if err != nil {
			slog.Warn("Backfill failed to pre-resolve hold DID", "hold_ref", holdRef, "error", err)
			continue
		}
		// Pre-cache captain record (skips if cached within last hour)
		if err := b.queryCaptainRecord(ctx, holdDID); err != nil {
			slog.Warn("Backfill failed to pre-cache captain record", "hold_did", holdDID, "error", err)
		}
	}
}
// processRecordWith processes a single record using the given processor.
// This allows backfillRepo to use a transactional processor while other callers use the default.
func (b *BackfillWorker) processRecordWith(ctx context.Context, proc *Processor, did, collection string, record *atproto.Record) error {
	// Only sailor profile records need the captain-record lookup callback;
	// every other collection gets nil.
	var captainFn func(context.Context, string) error
	if collection == atproto.SailorProfileCollection {
		captainFn = b.queryCaptainRecordWrapper
	}
	return proc.ProcessRecord(ctx, did, collection, extractRkeyFromURI(record.URI), record.Value, false, captainFn)
}
// queryCaptainRecordWrapper wraps queryCaptainRecord with backfill-specific logic:
// failures never abort the backfill, and in test mode failures for external
// (non-default) holds are silently suppressed.
func (b *BackfillWorker) queryCaptainRecordWrapper(ctx context.Context, holdDID string) error {
	err := b.queryCaptainRecord(ctx, holdDID)
	if err == nil {
		return nil
	}
	// In test mode, only warn about the default (local) hold.
	// External/production holds may not have captain records yet (dev ahead of prod).
	if !b.testMode || holdDID == b.defaultHoldDID {
		slog.Warn("Backfill failed to query captain record for hold", "hold_did", holdDID, "error", err)
	}
	// Don't fail the whole backfill - just skip this hold.
	return nil
}
// queryCaptainRecord queries a hold's captain record and caches it in the database.
// The cached copy is reused for up to one hour to avoid hammering the hold's PDS.
// Connection-refused errors are retried a few times to tolerate Docker startup
// ordering where the hold service is still coming up.
func (b *BackfillWorker) queryCaptainRecord(ctx context.Context, holdDID string) error {
	// Check if we already have it cached (skip if recently updated)
	existing, err := db.GetCaptainRecord(b.db, holdDID)
	if err == nil && existing != nil {
		// If cached within last hour, skip refresh
		if time.Since(existing.UpdatedAt) < 1*time.Hour {
			return nil
		}
	}
	// Resolve hold DID to URL
	holdURL, err := atproto.ResolveHoldURL(ctx, holdDID)
	if err != nil {
		return fmt.Errorf("failed to resolve hold URL for %s: %w", holdDID, err)
	}
	// Create client for hold's PDS
	holdClient := atproto.NewClient(holdURL, holdDID, "")
	// Query captain record with retries (for Docker startup timing)
	var record *atproto.Record
	maxRetries := 3
	for attempt := 1; attempt <= maxRetries; attempt++ {
		record, err = holdClient.GetRecord(ctx, "io.atcr.hold.captain", "self")
		if err == nil {
			break
		}
		// Retry on connection errors (hold service might still be starting)
		if attempt < maxRetries && strings.Contains(err.Error(), "connection refused") {
			slog.Info("Backfill hold not ready, retrying", "attempt", attempt, "max_retries", maxRetries)
			// Wait before retrying, but respect context cancellation — the previous
			// bare time.Sleep would block shutdown for the full back-off.
			select {
			case <-ctx.Done():
				return ctx.Err()
			case <-time.After(2 * time.Second):
			}
			continue
		}
		return fmt.Errorf("failed to get captain record: %w", err)
	}
	// Guard against a nil record returned with a nil error; the previous code
	// would panic dereferencing record.Value in that case.
	if record == nil {
		return fmt.Errorf("failed to get captain record: empty response from hold %s", holdDID)
	}
	// Parse captain record directly into db struct
	var captainRecord db.HoldCaptainRecord
	if err := json.Unmarshal(record.Value, &captainRecord); err != nil {
		return fmt.Errorf("failed to parse captain record: %w", err)
	}
	// Set fields not from JSON
	captainRecord.HoldDID = holdDID
	captainRecord.UpdatedAt = time.Now()
	if err := db.UpsertCaptainRecord(b.db, &captainRecord); err != nil {
		return fmt.Errorf("failed to cache captain record: %w", err)
	}
	slog.Info("Backfill cached captain record for hold", "hold_did", holdDID, "owner_did", captainRecord.OwnerDID)
	return nil
}
// queryCrewRecords queries a hold's crew records and caches them in the database.
// This is necessary for localhost/private holds that aren't discoverable via the relay.
// A hold with no crew records at all is not an error.
func (b *BackfillWorker) queryCrewRecords(ctx context.Context, holdDID string) error {
	// Resolve hold DID to URL
	holdURL, err := atproto.ResolveHoldURL(ctx, holdDID)
	if err != nil {
		return fmt.Errorf("failed to resolve hold URL for %s: %w", holdDID, err)
	}
	// Create client for hold's PDS
	holdClient := atproto.NewClient(holdURL, holdDID, "")

	cached := 0
	// Paginate through all crew records until the cursor runs out.
	for cursor := ""; ; {
		records, nextCursor, err := holdClient.ListRecordsForRepo(ctx, holdDID, atproto.CrewCollection, 100, cursor)
		if err != nil {
			// If no crew records exist, that's okay
			msg := err.Error()
			if strings.Contains(msg, "404") || strings.Contains(msg, "RecordNotFound") {
				slog.Debug("No crew records found for hold", "hold_did", holdDID)
				return nil
			}
			return fmt.Errorf("failed to list crew records: %w", err)
		}
		for _, record := range records {
			rkey := extractRkeyFromURI(record.URI)
			if err := b.processor.ProcessCrew(ctx, holdDID, rkey, record.Value); err != nil {
				slog.Warn("Backfill failed to process crew record", "hold_did", holdDID, "uri", record.URI, "error", err)
				continue
			}
			cached++
		}
		if nextCursor == "" {
			break
		}
		cursor = nextCursor
	}
	if cached > 0 {
		slog.Info("Backfill cached crew records for hold", "hold_did", holdDID, "count", cached)
	}
	return nil
}
// reconcileAnnotations ensures annotations come from the newest manifest in each repository.
// This fixes the out-of-order backfill issue where older manifests can overwrite newer annotations.
// Every per-repository failure is logged and skipped; the function only fails if
// the repository list itself cannot be loaded.
func (b *BackfillWorker) reconcileAnnotations(ctx context.Context, did string, pdsClient *atproto.Client) error {
	// Get all repositories for this DID
	repositories, err := db.GetRepositoriesForDID(b.db, did)
	if err != nil {
		return fmt.Errorf("failed to get repositories: %w", err)
	}
	for _, repo := range repositories {
		// Find newest manifest for this repository.
		// NOTE(review): assumes GetNewestManifestForRepo returns a non-nil manifest
		// whenever err == nil — a nil here would panic on newestManifest.Digest below;
		// confirm against the db helper's contract.
		newestManifest, err := db.GetNewestManifestForRepo(b.db, did, repo)
		if err != nil {
			slog.Warn("Backfill failed to get newest manifest for repo", "did", did, "repository", repo, "error", err)
			continue // Skip on error
		}
		// Fetch the full manifest record from PDS using the digest as rkey
		// (manifest rkeys are the digest without the "sha256:" prefix).
		rkey := strings.TrimPrefix(newestManifest.Digest, "sha256:")
		record, err := pdsClient.GetRecord(ctx, atproto.ManifestCollection, rkey)
		if err != nil {
			slog.Warn("Backfill failed to fetch manifest record for repo", "did", did, "repository", repo, "error", err)
			continue // Skip on error
		}
		// Parse manifest record
		var manifestRecord atproto.ManifestRecord
		if err := json.Unmarshal(record.Value, &manifestRecord); err != nil {
			slog.Warn("Backfill failed to parse manifest record for repo", "did", did, "repository", repo, "error", err)
			continue
		}
		// Update annotations from newest manifest only
		if len(manifestRecord.Annotations) > 0 {
			// Filter out empty annotations: only upsert if at least one value is non-empty,
			// so a manifest with only blank annotation values doesn't clobber the DB.
			hasData := false
			for _, value := range manifestRecord.Annotations {
				if value != "" {
					hasData = true
					break
				}
			}
			if hasData {
				err = db.UpsertRepositoryAnnotations(b.db, did, repo, manifestRecord.Annotations)
				if err != nil {
					slog.Warn("Backfill failed to reconcile annotations for repo", "did", did, "repository", repo, "error", err)
				} else {
					slog.Info("Backfill reconciled annotations for repo from newest manifest", "did", did, "repository", repo, "digest", newestManifest.Digest)
				}
			}
		}
	}
	return nil
}
// reconcileRepoPageDescriptions fetches README content from external sources for repo pages with empty descriptions.
// If the user has an OAuth session, it updates the PDS record (source of truth).
// Otherwise, it just stores the fetched content in the database.
// Pages the user has manually edited, or that already have a description, are never touched.
func (b *BackfillWorker) reconcileRepoPageDescriptions(ctx context.Context, did, pdsEndpoint string) error {
	// Get all repo pages for this DID
	repoPages, err := db.GetRepoPagesByDID(b.db, did)
	if err != nil {
		return fmt.Errorf("failed to get repo pages: %w", err)
	}
	for _, page := range repoPages {
		// Skip pages that were manually edited by the user or already have a description
		if page.UserEdited || page.Description != "" {
			continue
		}
		// Get annotations from the repository's manifest; the README source URL
		// is derived from them in fetchReadmeContent.
		annotations, err := db.GetRepositoryAnnotations(b.db, did, page.Repository)
		if err != nil {
			slog.Debug("Failed to get annotations for repo page", "did", did, "repository", page.Repository, "error", err)
			continue
		}
		// Try to fetch README content from external sources
		description := b.fetchReadmeContent(ctx, annotations)
		if description == "" {
			// No README content available, skip
			continue
		}
		slog.Info("Fetched README for repo page", "did", did, "repository", page.Repository, "descriptionLength", len(description))
		// Try to update PDS if we have OAuth session; PDS failures degrade to DB-only.
		pdsUpdated := false
		if b.refresher != nil {
			if err := b.updateRepoPageInPDS(ctx, did, pdsEndpoint, page.Repository, description, page.AvatarCID); err != nil {
				slog.Debug("Could not update repo page in PDS, falling back to DB-only", "did", did, "repository", page.Repository, "error", err)
			} else {
				pdsUpdated = true
				slog.Info("Updated repo page in PDS with fetched description", "did", did, "repository", page.Repository)
			}
		}
		// Always update database with the fetched content
		// (userEdited stays false: this content was machine-fetched, not user-authored).
		if err := db.UpsertRepoPage(b.db, did, page.Repository, description, page.AvatarCID, false, page.CreatedAt, time.Now()); err != nil {
			slog.Warn("Failed to update repo page in database", "did", did, "repository", page.Repository, "error", err)
		} else if !pdsUpdated {
			slog.Info("Updated repo page in database (PDS not updated)", "did", did, "repository", page.Repository)
		}
	}
	return nil
}
// fetchReadmeContent attempts to fetch README content from external sources based on annotations.
// Priority: io.atcr.readme annotation > derived from org.opencontainers.image.source.
// Returns the empty string when nothing could be fetched.
func (b *BackfillWorker) fetchReadmeContent(ctx context.Context, annotations map[string]string) string {
	// All attempts share a single 10-second timeout budget.
	fetchCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	// Priority 1: Direct README URL from io.atcr.readme annotation
	if directURL := annotations["io.atcr.readme"]; directURL != "" {
		content, err := b.fetchRawReadme(fetchCtx, directURL)
		switch {
		case err != nil:
			slog.Debug("Failed to fetch README from io.atcr.readme annotation", "url", directURL, "error", err)
		case content != "":
			return content
		}
	}

	// Priority 2: Derive README URL from org.opencontainers.image.source,
	// probing the conventional default branches in order.
	sourceURL := annotations["org.opencontainers.image.source"]
	if sourceURL == "" {
		return ""
	}
	for _, branch := range []string{"main", "master"} {
		derivedURL := readme.DeriveReadmeURL(sourceURL, branch)
		if derivedURL == "" {
			continue
		}
		content, err := b.fetchRawReadme(fetchCtx, derivedURL)
		if err != nil {
			// 404s are expected while probing main vs master; log only other errors.
			if !readme.Is404(err) {
				slog.Debug("Failed to fetch README from source URL", "url", derivedURL, "branch", branch, "error", err)
			}
			continue
		}
		if content != "" {
			return content
		}
	}
	return ""
}
// fetchRawReadme fetches raw markdown content from a URL, capped at 100KB.
func (b *BackfillWorker) fetchRawReadme(ctx context.Context, readmeURL string) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, readmeURL, nil)
	if err != nil {
		return "", fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("User-Agent", "ATCR-Backfill-README-Fetcher/1.0")

	// Per-call client with a hard 10s timeout and a cap of 5 redirects.
	httpClient := &http.Client{
		Timeout: 10 * time.Second,
		CheckRedirect: func(_ *http.Request, via []*http.Request) error {
			if len(via) >= 5 {
				return fmt.Errorf("too many redirects")
			}
			return nil
		},
	}
	resp, err := httpClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// The bare "status %d" text is preserved verbatim — presumably
		// readme.Is404 matches on it (see fetchReadmeContent).
		return "", fmt.Errorf("status %d", resp.StatusCode)
	}

	// Read at most 100KB so an oversized README cannot balloon memory.
	body, err := io.ReadAll(io.LimitReader(resp.Body, 100*1024))
	if err != nil {
		return "", fmt.Errorf("failed to read response body: %w", err)
	}
	return string(body), nil
}
// updateRepoPageInPDS updates the repo page record in the user's PDS using OAuth.
// It performs a read-modify-write: the existing record's CreatedAt and Avatar are
// preserved, the description is replaced, and UpdatedAt is set to now.
// Fails if no OAuth refresher is configured or the existing-record check errors
// for any reason other than record-not-found.
func (b *BackfillWorker) updateRepoPageInPDS(ctx context.Context, did, pdsEndpoint, repository, description, avatarCID string) error {
	if b.refresher == nil {
		return fmt.Errorf("no OAuth refresher available")
	}
	// Create ATProto client with session provider
	pdsClient := atproto.NewClientWithSessionProvider(pdsEndpoint, did, b.refresher)
	// Get existing repo page record to preserve other fields
	// (the repository name doubles as the record's rkey).
	existingRecord, err := pdsClient.GetRecord(ctx, atproto.RepoPageCollection, repository)
	var createdAt time.Time
	var avatarRef *atproto.ATProtoBlobRef
	if err != nil && !errors.Is(err, atproto.ErrRecordNotFound) {
		// Non-404 error (e.g., no OAuth session) - fail fast instead of trying PutRecord
		return fmt.Errorf("failed to check existing record: %w", err)
	}
	if err == nil && existingRecord != nil {
		// Parse existing record; on parse failure we fall through with zero values
		// and the record is treated as freshly created.
		var existingPage atproto.RepoPageRecord
		if err := json.Unmarshal(existingRecord.Value, &existingPage); err == nil {
			createdAt = existingPage.CreatedAt
			avatarRef = existingPage.Avatar
		}
	}
	// New record (or unparsable existing one): stamp creation time now.
	if createdAt.IsZero() {
		createdAt = time.Now()
	}
	// Create updated repo page record
	// NOTE(review): avatarCID is accepted but unused here — the avatar comes from
	// the existing PDS record's blob ref; confirm that is intentional.
	repoPage := &atproto.RepoPageRecord{
		Type:        atproto.RepoPageCollection,
		Repository:  repository,
		Description: description,
		Avatar:      avatarRef,
		CreatedAt:   createdAt,
		UpdatedAt:   time.Now(),
	}
	// Write to PDS - this will use DoWithSession internally
	_, err = pdsClient.PutRecord(ctx, atproto.RepoPageCollection, repository, repoPage)
	if err != nil {
		return fmt.Errorf("failed to write to PDS: %w", err)
	}
	return nil
}
// extractRkeyFromURI extracts the rkey from an AT-URI.
// Format: at://did/collection/rkey — returns "" when the URI has too few segments.
func extractRkeyFromURI(uri string) string {
	// Splitting "at://did/collection/rkey" on "/" yields
	// ["at:", "", did, collection, rkey], so the rkey is the fifth element.
	const rkeyIndex = 4
	parts := strings.SplitN(uri, "/", rkeyIndex+2)
	if len(parts) <= rkeyIndex {
		return ""
	}
	return parts[rkeyIndex]
}