Files
at-container-registry/scanner/internal/scan/worker.go
2026-04-12 20:48:24 -05:00

208 lines
6.0 KiB
Go

// Package scan implements the vulnerability scanning pipeline:
// extract layers → generate SBOM → scan vulnerabilities → send result.
package scan
import (
"context"
"fmt"
"log/slog"
"os"
"runtime"
"strings"
"sync"
"time"
scanner "atcr.io/scanner"
"atcr.io/scanner/internal/client"
"atcr.io/scanner/internal/config"
"atcr.io/scanner/internal/queue"
)
// WorkerPool manages a pool of scan workers
type WorkerPool struct {
	cfg    *config.Config     // scanner configuration (worker count, vuln settings, hold secret)
	queue  *queue.JobQueue    // job source; Dequeue returning nil signals shutdown
	client *client.HoldClient // result/error sink, addressed by job sequence number
	wg     sync.WaitGroup     // tracks live worker goroutines so Wait() can block on them
}
// NewWorkerPool creates a new worker pool wired to the given config,
// job queue, and hold client. Call Start to launch the workers.
func NewWorkerPool(cfg *config.Config, q *queue.JobQueue, c *client.HoldClient) *WorkerPool {
	pool := new(WorkerPool)
	pool.cfg = cfg
	pool.queue = q
	pool.client = c
	return pool
}
// Start launches the worker goroutines and kicks off one-time setup.
//
// It points TMPDIR at the configured tmp dir so Grype's DB download
// (go-getter zstd decompression can be 1 GB+) and stereoscope's layer
// extraction both land on the same partition as the scanner volume —
// NOT on /tmp, which is typically tmpfs with ~400 MB and would silently
// fail mid-extract. This must be set before any scanner/grype goroutine
// starts and must never be restored to a smaller default mid-process.
func (wp *WorkerPool) Start(ctx context.Context) {
	if wp.cfg.Vuln.TmpDir != "" {
		if err := os.MkdirAll(wp.cfg.Vuln.TmpDir, 0o755); err != nil {
			// Do NOT set TMPDIR in this case: pointing it at a directory we
			// could not create would make every later extraction fail.
			slog.Warn("Failed to create scanner tmp dir", "path", wp.cfg.Vuln.TmpDir, "error", err)
		} else if err := os.Setenv("TMPDIR", wp.cfg.Vuln.TmpDir); err != nil {
			slog.Warn("Failed to set TMPDIR", "path", wp.cfg.Vuln.TmpDir, "error", err)
		}
	}
	// Warm the vulnerability DB in the background so workers can start
	// dequeuing immediately; until it is ready, vuln scans are unavailable.
	if wp.cfg.Vuln.Enabled {
		go func() {
			if err := initializeVulnDatabase(wp.cfg.Vuln.DBPath); err != nil {
				slog.Error("Failed to initialize vulnerability database", "error", err)
				slog.Warn("Vulnerability scanning will be disabled until database is available")
			}
		}()
	}
	for i := 0; i < wp.cfg.Scanner.Workers; i++ {
		wp.wg.Add(1)
		go wp.worker(ctx, i)
	}
	slog.Info("Scanner worker pool started", "workers", wp.cfg.Scanner.Workers)
}
// Wait blocks until all workers finish
func (wp *WorkerPool) Wait() {
	// Each worker defers wg.Done; workers exit when Dequeue returns nil
	// or the context passed to Start is cancelled.
	wp.wg.Wait()
}
// worker is the per-goroutine scan loop: it dequeues jobs until the queue
// drains (Dequeue returns nil) or ctx is cancelled, runs each job through
// processJob, and reports the outcome back through the hold client.
func (wp *WorkerPool) worker(ctx context.Context, id int) {
	defer wp.wg.Done()
	slog.Info("Scanner worker started", "worker_id", id)
	for {
		job := wp.queue.Dequeue()
		if job == nil {
			slog.Info("Scanner worker shutting down", "worker_id", id)
			return
		}
		slog.Info("Processing scan job",
			"worker_id", id,
			"repository", job.Repository,
			"tag", job.Tag,
			"digest", job.ManifestDigest,
			"tier", job.Tier)
		result, err := wp.processJob(ctx, job)
		if err != nil {
			// "skipped:" errors (e.g. unscannable artifact types) are
			// expected, so log them at info rather than error.
			logLevel := slog.LevelError
			if strings.HasPrefix(err.Error(), "skipped:") {
				logLevel = slog.LevelInfo
			}
			slog.Log(ctx, logLevel, "Scan job failed",
				"worker_id", id,
				"repository", job.Repository,
				"error", err)
			wp.client.SendError(job.Seq, err.Error())
		} else {
			wp.client.SendResult(job.Seq, result)
			// BUG FIX: result.Summary is only populated when vulnerability
			// scanning is enabled; dereferencing it unconditionally here
			// panicked on every successful scan with Vuln.Enabled=false.
			vulnCount := 0
			if result.Summary != nil {
				vulnCount = result.Summary.Total
			}
			slog.Info("Scan job completed",
				"worker_id", id,
				"repository", job.Repository,
				"vulnerabilities", vulnCount)
		}
		// Free large scan artifacts and trigger GC before the cooldown
		// so memory is reclaimed between jobs. Syft/Grype allocate heavily
		// and Go's GC needs idle time to catch up under sustained load.
		result = nil
		runtime.GC()
		// Cooldown between scans to reduce sustained memory pressure
		select {
		case <-ctx.Done():
			return
		case <-time.After(10 * time.Second):
		}
	}
}
// unscannable config media types — these are OCI artifacts that aren't
// container images so Syft/Grype can't analyze their layers.
// processJob consults this table before downloading any layers, so these
// artifacts are rejected cheaply with a "skipped:" error.
var unscannableConfigTypes = map[string]bool{
	"application/vnd.cncf.helm.config.v1+json": true, // Helm charts
	"application/vnd.in-toto+json":             true, // In-toto attestations
	"application/vnd.dsse.envelope.v1+json":    true, // DSSE envelopes (SLSA)
}
// processJob runs the full scan pipeline for one job:
// build OCI layout → generate SBOM (Syft) → scan vulnerabilities (Grype).
// It returns a result whose Summary is only set when vuln scanning is enabled.
func (wp *WorkerPool) processJob(ctx context.Context, job *scanner.ScanJob) (*scanner.ScanResult, error) {
	begin := time.Now()

	// Non-container OCI artifacts carry no layers Syft/Grype can analyze;
	// reject them before downloading anything.
	if unscannableConfigTypes[job.Config.MediaType] {
		return nil, fmt.Errorf("skipped: unscannable artifact type %s", job.Config.MediaType)
	}

	if err := ensureDir(wp.cfg.Vuln.TmpDir); err != nil {
		return nil, fmt.Errorf("failed to create tmp dir: %w", err)
	}

	// Enforce the compressed-size limit before any blob download.
	if limit := wp.cfg.Vuln.MaxImageSize; limit > 0 {
		total := job.Config.Size
		for _, layer := range job.Layers {
			total += layer.Size
		}
		if total > limit {
			return nil, fmt.Errorf("image too large: %d bytes compressed (limit %d bytes)", total, limit)
		}
	}

	// Step 1: assemble an OCI image layout from hold via presigned URLs.
	slog.Info("Building OCI layout", "repository", job.Repository)
	layoutDir, cleanup, err := buildOCILayout(job, wp.cfg.Vuln.TmpDir, wp.cfg.Hold.Secret)
	if err != nil {
		return nil, fmt.Errorf("failed to build OCI layout: %w", err)
	}
	defer cleanup()

	// Step 2: generate the SBOM with Syft.
	slog.Info("Generating SBOM", "repository", job.Repository)
	sbom, sbomJSON, sbomDigest, err := generateSBOM(ctx, layoutDir)
	if err != nil {
		return nil, fmt.Errorf("failed to generate SBOM: %w", err)
	}

	result := &scanner.ScanResult{
		ManifestDigest: job.ManifestDigest,
		SBOM:           sbomJSON,
		SBOMDigest:     sbomDigest,
	}

	// Step 3: scan the SBOM with Grype when vuln scanning is enabled.
	if wp.cfg.Vuln.Enabled {
		slog.Info("Scanning for vulnerabilities", "repository", job.Repository, "handle", job.UserHandle)
		vulnJSON, vulnDigest, summary, err := scanVulnerabilities(ctx, sbom, wp.cfg.Vuln.DBPath)
		if err != nil {
			return nil, fmt.Errorf("failed to scan vulnerabilities: %w", err)
		}
		result.VulnReport = vulnJSON
		result.VulnDigest = vulnDigest
		result.Summary = &summary
	}
	sbom = nil // release SBOM catalog for GC

	slog.Info("Scan pipeline completed",
		"repository", job.Repository,
		"duration", time.Since(begin))
	return result, nil
}
// ensureDir creates path and any missing parents with 0o755 permissions.
// It is a no-op (nil error) when the directory already exists.
// Uses the 0o octal form for consistency with the rest of this file.
func ensureDir(path string) error {
	return os.MkdirAll(path, 0o755)
}