mirror of
https://tangled.org/evan.jarrett.net/at-container-registry
synced 2026-04-20 08:30:29 +00:00
improve backfill and jetstream db connections
This commit is contained in:
@@ -92,6 +92,16 @@ func InitDB(path string, cfg LibsqlConfig) (*sql.DB, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Bound the connection pool. With a remote target (Bunny Database), each
|
||||
// idle conn is a stable libsql stream — keeping a handful warm avoids
|
||||
// reconnect cost, capping the total prevents runaway contention. Short
|
||||
// lifetimes ensure we recycle past any idle-side disconnects and drop any
|
||||
// poisoned conn that survived IsPoisonedTxErr eviction.
|
||||
db.SetMaxOpenConns(8)
|
||||
db.SetMaxIdleConns(4)
|
||||
db.SetConnMaxLifetime(5 * time.Minute)
|
||||
db.SetConnMaxIdleTime(2 * time.Minute)
|
||||
|
||||
// Check if this is an existing database with migrations applied
|
||||
isExisting, err := hasAppliedMigrations(db)
|
||||
if err != nil {
|
||||
@@ -202,20 +212,21 @@ func runMigrations(db *sql.DB, freshDB bool) error {
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to begin transaction for migration %d: %w", m.Version, err)
|
||||
}
|
||||
// Deferred rollback is a no-op once Commit succeeds; it guards against
|
||||
// panics and any early return that forgets an explicit rollback.
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
|
||||
// Split query into individual statements and execute each
|
||||
// go-sqlite3's Exec() doesn't reliably execute all statements in multi-statement queries
|
||||
statements := splitSQLStatements(m.Query)
|
||||
for i, stmt := range statements {
|
||||
if _, err := tx.Exec(stmt); err != nil {
|
||||
tx.Rollback()
|
||||
return fmt.Errorf("failed to apply migration %d (%s) statement %d: %w", m.Version, m.Name, i+1, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Record migration
|
||||
if _, err := tx.Exec("INSERT INTO schema_migrations (version) VALUES (?)", m.Version); err != nil {
|
||||
tx.Rollback()
|
||||
return fmt.Errorf("failed to record migration %d: %w", m.Version, err)
|
||||
}
|
||||
|
||||
|
||||
@@ -180,6 +180,12 @@ CREATE TABLE IF NOT EXISTS repository_stats_daily (
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_repo_stats_daily_date ON repository_stats_daily(date DESC);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS jetstream_cursor (
|
||||
id INTEGER PRIMARY KEY CHECK (id = 1),
|
||||
cursor INTEGER NOT NULL,
|
||||
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS stars (
|
||||
starrer_did TEXT NOT NULL,
|
||||
owner_did TEXT NOT NULL,
|
||||
|
||||
@@ -267,36 +267,51 @@ func (b *BackfillWorker) backfillRepo(ctx context.Context, did, collection strin
|
||||
// ProcessManifest calls ResolveHoldDID for legacy manifests.
|
||||
b.prewarmHoldCaches(ctx, collection, allRecords)
|
||||
|
||||
// Phase 3: Process records in chunked transactions.
|
||||
// All network I/O should be cached by now, so transactions stay fast.
|
||||
const chunkSize = 20
|
||||
recordCount := 0
|
||||
|
||||
for i := 0; i < len(allRecords); i += chunkSize {
|
||||
end := i + chunkSize
|
||||
if end > len(allRecords) {
|
||||
end = len(allRecords)
|
||||
}
|
||||
|
||||
tx, err := b.db.Begin()
|
||||
if err != nil {
|
||||
return recordCount, fmt.Errorf("failed to begin transaction: %w", err)
|
||||
}
|
||||
|
||||
txProcessor := NewProcessor(tx, false, b.processor.statsCache)
|
||||
|
||||
for j := i; j < end; j++ {
|
||||
if err := b.processRecordWith(ctx, txProcessor, did, collection, &allRecords[j]); err != nil {
|
||||
slog.Warn("Backfill failed to process record", "uri", allRecords[j].URI, "error", err)
|
||||
// Phase 3: Write records to the DB.
|
||||
//
|
||||
// For collections whose writes are straightforward idempotent upserts, we
|
||||
// batch every record in the repo into one multi-row INSERT per table. This
|
||||
// replaces the previous 20-record chunked transaction loop, which exceeded
|
||||
// Bunny Database's remote transaction timeout (~5s) once chunks grew large
|
||||
// and poisoned the connection pool on timeout.
|
||||
//
|
||||
// Collections that do network I/O per record (SailorProfile) or have
|
||||
// conditional read-then-write logic (Scan) stay on the single-record path
|
||||
// where each write is its own statement and cannot hold a long transaction.
|
||||
var recordCount int
|
||||
var procErr error
|
||||
switch collection {
|
||||
case atproto.ManifestCollection:
|
||||
recordCount, procErr = b.batchManifests(ctx, did, allRecords)
|
||||
case atproto.TagCollection:
|
||||
recordCount, procErr = b.batchTags(did, allRecords)
|
||||
case atproto.StarCollection:
|
||||
recordCount, procErr = b.batchStars(ctx, did, allRecords)
|
||||
case atproto.RepoPageCollection:
|
||||
recordCount, procErr = b.batchRepoPages(did, allRecords)
|
||||
case atproto.DailyStatsCollection:
|
||||
recordCount, procErr = b.batchDailyStats(ctx, did, allRecords)
|
||||
case atproto.StatsCollection:
|
||||
recordCount, procErr = b.batchStats(ctx, did, allRecords)
|
||||
case atproto.CaptainCollection:
|
||||
recordCount, procErr = b.batchCaptains(did, allRecords)
|
||||
case atproto.CrewCollection:
|
||||
recordCount, procErr = b.batchCrew(did, allRecords)
|
||||
default:
|
||||
// SailorProfileCollection and ScanCollection keep per-record processing
|
||||
// because they do network I/O or conditional reads that would be awkward
|
||||
// to batch. Each call writes a single row, so there is no long-lived
|
||||
// transaction at risk.
|
||||
for i := range allRecords {
|
||||
if err := b.processRecordWith(ctx, b.processor, did, collection, &allRecords[i]); err != nil {
|
||||
slog.Warn("Backfill failed to process record", "uri", allRecords[i].URI, "error", err)
|
||||
continue
|
||||
}
|
||||
recordCount++
|
||||
}
|
||||
|
||||
if err := tx.Commit(); err != nil {
|
||||
tx.Rollback()
|
||||
return recordCount, fmt.Errorf("failed to commit transaction: %w", err)
|
||||
}
|
||||
}
|
||||
if procErr != nil {
|
||||
return recordCount, procErr
|
||||
}
|
||||
|
||||
// Reconciliation runs outside the transaction (involves network I/O and fewer writes)
|
||||
|
||||
@@ -49,6 +49,12 @@ type Worker struct {
|
||||
// In-memory cursor tracking for reconnects
|
||||
lastCursor int64
|
||||
cursorMutex sync.RWMutex
|
||||
|
||||
// Cursor persistence: a single-slot channel carries the most recent
|
||||
// cursor to a background saver goroutine. The saver writes to
|
||||
// jetstream_cursor every tick, dropping any older value that has not
|
||||
// yet been flushed so the WS read loop is never blocked on DB I/O.
|
||||
cursorSave chan int64
|
||||
}
|
||||
|
||||
// NewWorker creates a new Jetstream worker
|
||||
@@ -74,6 +80,7 @@ func NewWorker(database *sql.DB, urls []string, startCursor int64) *Worker {
|
||||
},
|
||||
statsCache: statsCache,
|
||||
processor: NewProcessor(database, true, statsCache), // Use cache for live streaming
|
||||
cursorSave: make(chan int64, 1),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -258,7 +265,7 @@ func (w *Worker) Start(ctx context.Context) error {
|
||||
// TODO: Re-enable compression once debugging is complete
|
||||
_ = decoder // Keep decoder to avoid unused variable error
|
||||
|
||||
if err := w.processMessage(message); err != nil {
|
||||
if err := w.processMessageResilient(ctx, message); err != nil {
|
||||
slog.Error("ERROR processing message", "error", err)
|
||||
// Continue processing other messages
|
||||
} else {
|
||||
@@ -274,6 +281,37 @@ func (w *Worker) Start(ctx context.Context) error {
|
||||
// 30 seconds to avoid missing events (events are idempotent DB upserts).
|
||||
// Cycles through all endpoints indefinitely and never gives up.
|
||||
func (w *Worker) StartWithFailover(ctx context.Context) {
|
||||
// Bootstrap from the persisted cursor the first time we run. If the DB
|
||||
// has a saved cursor we resume from it (minus a small safety rewind so
|
||||
// any gap from the previous shutdown is covered). Events are idempotent
|
||||
// UPSERTs, so re-processing a handful is harmless.
|
||||
if w.startCursor == 0 {
|
||||
if cursor, err := db.GetJetstreamCursor(w.db); err != nil {
|
||||
slog.Warn("Jetstream failed to load persisted cursor", "error", err)
|
||||
} else if cursor > 0 {
|
||||
const rewind = int64(30 * 1_000_000) // 30s safety rewind, same units as cursor
|
||||
resume := cursor - rewind
|
||||
if resume < 0 {
|
||||
resume = 0
|
||||
}
|
||||
w.cursorMutex.Lock()
|
||||
w.startCursor = resume
|
||||
w.lastCursor = resume
|
||||
w.cursorMutex.Unlock()
|
||||
slog.Info("Jetstream resuming from persisted cursor",
|
||||
"persisted_cursor", cursor,
|
||||
"resume_cursor", resume)
|
||||
}
|
||||
}
|
||||
|
||||
// Launch the background cursor saver. It runs for the lifetime of this
|
||||
// Start call and exits on ctx.Done with a final flush.
|
||||
saverDone := make(chan struct{})
|
||||
go w.runCursorSaver(ctx, saverDone)
|
||||
defer func() {
|
||||
<-saverDone
|
||||
}()
|
||||
|
||||
retryDelays := []time.Duration{1 * time.Second, 5 * time.Second, 10 * time.Second}
|
||||
|
||||
for {
|
||||
@@ -358,6 +396,49 @@ func (w *Worker) Processor() *Processor {
|
||||
return w.processor
|
||||
}
|
||||
|
||||
// runCursorSaver is a long-running goroutine that persists the most recent
|
||||
// Jetstream cursor to SQLite. It writes at most once every cursorSaveInterval
|
||||
// so we never hit the DB faster than it can keep up, and always flushes a
|
||||
// final value on shutdown so the next Start resumes from the right place.
|
||||
//
|
||||
// The goroutine intentionally uses b.db directly (not ExecResilient) because
|
||||
// the INSERT ... ON CONFLICT statement is a single round-trip that cannot
|
||||
// trigger the poisoned-tx cascade.
|
||||
func (w *Worker) runCursorSaver(ctx context.Context, done chan<- struct{}) {
|
||||
defer close(done)
|
||||
|
||||
const cursorSaveInterval = 5 * time.Second
|
||||
ticker := time.NewTicker(cursorSaveInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
var pending int64
|
||||
flush := func() {
|
||||
if pending == 0 {
|
||||
return
|
||||
}
|
||||
if err := db.SaveJetstreamCursor(w.db, pending); err != nil {
|
||||
slog.Warn("Jetstream failed to persist cursor", "cursor", pending, "error", err)
|
||||
return
|
||||
}
|
||||
pending = 0
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
flush()
|
||||
return
|
||||
case c := <-w.cursorSave:
|
||||
// Keep only the newest value; the ticker decides when to flush.
|
||||
if c > pending {
|
||||
pending = c
|
||||
}
|
||||
case <-ticker.C:
|
||||
flush()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// GetLastCursor returns the last processed cursor (time_us) for reconnects
|
||||
func (w *Worker) GetLastCursor() int64 {
|
||||
w.cursorMutex.RLock()
|
||||
@@ -365,6 +446,49 @@ func (w *Worker) GetLastCursor() int64 {
|
||||
return w.lastCursor
|
||||
}
|
||||
|
||||
// processMessageResilient runs processMessage and, if the underlying DB
|
||||
// connection was poisoned by a remote tx timeout (common with Bunny Database
|
||||
// after a backfill chunk exceeded the server-side transaction limit), drains
|
||||
// the poisoned connections from the pool and retries once. A second failure
|
||||
// returns the error so the caller's Error log line fires — replacing silent
|
||||
// data loss with a loud, attributable one.
|
||||
func (w *Worker) processMessageResilient(ctx context.Context, message []byte) error {
|
||||
err := w.processMessage(message)
|
||||
if err == nil || !db.IsPoisonedTxErr(err) {
|
||||
return err
|
||||
}
|
||||
|
||||
slog.Warn("Jetstream poisoned connection detected, draining pool and retrying",
|
||||
"error", err)
|
||||
drainPool(ctx, w.db)
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
return w.processMessage(message)
|
||||
}
|
||||
|
||||
// drainPool borrows each idle connection from the pool in turn and runs a
|
||||
// trivial probe. A poisoned connection fails the probe, and db.ExecResilient
|
||||
// evicts it via driver.ErrBadConn. Loops up to the pool's open-connection
|
||||
// limit so a single call can clear every bad conn.
|
||||
func drainPool(ctx context.Context, database *sql.DB) {
|
||||
// MaxOpenConns is 8 (see pkg/appview/db/schema.go). We probe one more time
|
||||
// than that to ensure we cycle through every conn if any were mid-use.
|
||||
const maxProbes = 10
|
||||
for i := 0; i < maxProbes; i++ {
|
||||
err := db.ExecResilient(ctx, database, func(conn *sql.Conn) error {
|
||||
_, err := conn.ExecContext(ctx, "SELECT 1")
|
||||
return err
|
||||
})
|
||||
if err == nil {
|
||||
// Got a healthy conn; no need to probe further — any remaining
|
||||
// poisoned conns will be evicted on their next use.
|
||||
return
|
||||
}
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// processMessage processes a single Jetstream event
|
||||
func (w *Worker) processMessage(message []byte) error {
|
||||
var event JetstreamEvent
|
||||
@@ -377,6 +501,26 @@ func (w *Worker) processMessage(message []byte) error {
|
||||
w.lastCursor = event.TimeUS
|
||||
w.cursorMutex.Unlock()
|
||||
|
||||
// Offer the cursor to the async saver. Non-blocking: if the saver is
|
||||
// still writing the previous value, we drop-and-replace so the DB always
|
||||
// converges on the freshest cursor without ever stalling the read loop.
|
||||
if w.cursorSave != nil {
|
||||
select {
|
||||
case w.cursorSave <- event.TimeUS:
|
||||
default:
|
||||
// Drain any stale value and try once more — if that still fails
|
||||
// we just skip this tick; the saver's timer will catch up.
|
||||
select {
|
||||
case <-w.cursorSave:
|
||||
default:
|
||||
}
|
||||
select {
|
||||
case w.cursorSave <- event.TimeUS:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Call callback if set
|
||||
if w.eventCallback != nil {
|
||||
w.eventCallback(event.TimeUS)
|
||||
|
||||
Reference in New Issue
Block a user