Files
at-container-registry/scanner/internal/scan/worker.go
2026-04-12 20:48:24 -05:00

208 lines
6.0 KiB
Go

// Package scan implements the vulnerability scanning pipeline:
// extract layers → generate SBOM → scan vulnerabilities → send result.
package scan
import (
"context"
"fmt"
"log/slog"
"os"
"runtime"
"strings"
"sync"
"time"
scanner "atcr.io/scanner"
"atcr.io/scanner/internal/client"
"atcr.io/scanner/internal/config"
"atcr.io/scanner/internal/queue"
)
// WorkerPool manages a pool of scan workers
type WorkerPool struct {
	cfg    *config.Config     // scanner configuration (worker count, vuln settings, hold secret)
	queue  *queue.JobQueue    // job source; Dequeue returning nil signals shutdown
	client *client.HoldClient // result/error sink, addressed by job sequence number
	wg     sync.WaitGroup     // tracks live worker goroutines so Wait() can block on them
}
// NewWorkerPool creates a new worker pool wired to the given config,
// job queue, and hold client. Call Start to launch the workers.
func NewWorkerPool(cfg *config.Config, q *queue.JobQueue, c *client.HoldClient) *WorkerPool {
	pool := new(WorkerPool)
	pool.cfg = cfg
	pool.queue = q
	pool.client = c
	return pool
}
// Start launches the worker goroutines and kicks off one-time setup.
//
// It points TMPDIR at the configured tmp dir so Grype's DB download
// (go-getter zstd decompression can be 1 GB+) and stereoscope's layer
// extraction both land on the same partition as the scanner volume —
// NOT on /tmp, which is typically tmpfs with ~400 MB and would silently
// fail mid-extract. This must be set before any scanner/grype goroutine
// starts and must never be restored to a smaller default mid-process.
func (wp *WorkerPool) Start(ctx context.Context) {
	if wp.cfg.Vuln.TmpDir != "" {
		if err := os.MkdirAll(wp.cfg.Vuln.TmpDir, 0o755); err != nil {
			// Do NOT set TMPDIR in this case: pointing it at a directory we
			// could not create would make every later extraction fail.
			slog.Warn("Failed to create scanner tmp dir", "path", wp.cfg.Vuln.TmpDir, "error", err)
		} else if err := os.Setenv("TMPDIR", wp.cfg.Vuln.TmpDir); err != nil {
			slog.Warn("Failed to set TMPDIR", "path", wp.cfg.Vuln.TmpDir, "error", err)
		}
	}
	// Warm the vulnerability DB in the background so workers can start
	// dequeuing immediately; until it is ready, vuln scans are unavailable.
	if wp.cfg.Vuln.Enabled {
		go func() {
			if err := initializeVulnDatabase(wp.cfg.Vuln.DBPath); err != nil {
				slog.Error("Failed to initialize vulnerability database", "error", err)
				slog.Warn("Vulnerability scanning will be disabled until database is available")
			}
		}()
	}
	for i := 0; i < wp.cfg.Scanner.Workers; i++ {
		wp.wg.Add(1)
		go wp.worker(ctx, i)
	}
	slog.Info("Scanner worker pool started", "workers", wp.cfg.Scanner.Workers)
}
// Wait blocks until all workers finish
func (wp *WorkerPool) Wait() {
	// Each worker defers wg.Done; workers exit when Dequeue returns nil
	// or the context passed to Start is cancelled.
	wp.wg.Wait()
}
// worker is the per-goroutine scan loop: it dequeues jobs until the queue
// drains (Dequeue returns nil) or ctx is cancelled, runs each job through
// processJob, and reports the outcome back through the hold client.
func (wp *WorkerPool) worker(ctx context.Context, id int) {
	defer wp.wg.Done()
	slog.Info("Scanner worker started", "worker_id", id)
	for {
		job := wp.queue.Dequeue()
		if job == nil {
			slog.Info("Scanner worker shutting down", "worker_id", id)
			return
		}
		slog.Info("Processing scan job",
			"worker_id", id,
			"repository", job.Repository,
			"tag", job.Tag,
			"digest", job.ManifestDigest,
			"tier", job.Tier)
		result, err := wp.processJob(ctx, job)
		if err != nil {
			// "skipped:" errors (e.g. unscannable artifact types) are
			// expected, so log them at info rather than error.
			logLevel := slog.LevelError
			if strings.HasPrefix(err.Error(), "skipped:") {
				logLevel = slog.LevelInfo
			}
			slog.Log(ctx, logLevel, "Scan job failed",
				"worker_id", id,
				"repository", job.Repository,
				"error", err)
			wp.client.SendError(job.Seq, err.Error())
		} else {
			wp.client.SendResult(job.Seq, result)
			// BUG FIX: result.Summary is only populated when vulnerability
			// scanning is enabled; dereferencing it unconditionally here
			// panicked on every successful scan with Vuln.Enabled=false.
			vulnCount := 0
			if result.Summary != nil {
				vulnCount = result.Summary.Total
			}
			slog.Info("Scan job completed",
				"worker_id", id,
				"repository", job.Repository,
				"vulnerabilities", vulnCount)
		}
		// Free large scan artifacts and trigger GC before the cooldown
		// so memory is reclaimed between jobs. Syft/Grype allocate heavily
		// and Go's GC needs idle time to catch up under sustained load.
		result = nil
		runtime.GC()
		// Cooldown between scans to reduce sustained memory pressure
		select {
		case <-ctx.Done():
			return
		case <-time.After(10 * time.Second):
		}
	}
}
// unscannable config media types — these are OCI artifacts that aren't
// container images so Syft/Grype can't analyze their layers.
// processJob consults this table before downloading any layers, so these
// artifacts are rejected cheaply with a "skipped:" error.
var unscannableConfigTypes = map[string]bool{
	"application/vnd.cncf.helm.config.v1+json": true, // Helm charts
	"application/vnd.in-toto+json":             true, // In-toto attestations
	"application/vnd.dsse.envelope.v1+json":    true, // DSSE envelopes (SLSA)
}
// processJob runs the full scan pipeline for one job:
// build OCI layout → generate SBOM (Syft) → scan vulnerabilities (Grype).
// It returns a result whose Summary is only set when vuln scanning is enabled.
func (wp *WorkerPool) processJob(ctx context.Context, job *scanner.ScanJob) (*scanner.ScanResult, error) {
	begin := time.Now()

	// Non-container OCI artifacts carry no layers Syft/Grype can analyze;
	// reject them before downloading anything.
	if unscannableConfigTypes[job.Config.MediaType] {
		return nil, fmt.Errorf("skipped: unscannable artifact type %s", job.Config.MediaType)
	}

	if err := ensureDir(wp.cfg.Vuln.TmpDir); err != nil {
		return nil, fmt.Errorf("failed to create tmp dir: %w", err)
	}

	// Enforce the compressed-size limit before any blob download.
	if limit := wp.cfg.Vuln.MaxImageSize; limit > 0 {
		total := job.Config.Size
		for _, layer := range job.Layers {
			total += layer.Size
		}
		if total > limit {
			return nil, fmt.Errorf("image too large: %d bytes compressed (limit %d bytes)", total, limit)
		}
	}

	// Step 1: assemble an OCI image layout from hold via presigned URLs.
	slog.Info("Building OCI layout", "repository", job.Repository)
	layoutDir, cleanup, err := buildOCILayout(job, wp.cfg.Vuln.TmpDir, wp.cfg.Hold.Secret)
	if err != nil {
		return nil, fmt.Errorf("failed to build OCI layout: %w", err)
	}
	defer cleanup()

	// Step 2: generate the SBOM with Syft.
	slog.Info("Generating SBOM", "repository", job.Repository)
	sbom, sbomJSON, sbomDigest, err := generateSBOM(ctx, layoutDir)
	if err != nil {
		return nil, fmt.Errorf("failed to generate SBOM: %w", err)
	}

	result := &scanner.ScanResult{
		ManifestDigest: job.ManifestDigest,
		SBOM:           sbomJSON,
		SBOMDigest:     sbomDigest,
	}

	// Step 3: scan the SBOM with Grype when vuln scanning is enabled.
	if wp.cfg.Vuln.Enabled {
		slog.Info("Scanning for vulnerabilities", "repository", job.Repository, "handle", job.UserHandle)
		vulnJSON, vulnDigest, summary, err := scanVulnerabilities(ctx, sbom, wp.cfg.Vuln.DBPath)
		if err != nil {
			return nil, fmt.Errorf("failed to scan vulnerabilities: %w", err)
		}
		result.VulnReport = vulnJSON
		result.VulnDigest = vulnDigest
		result.Summary = &summary
	}
	sbom = nil // release SBOM catalog for GC

	slog.Info("Scan pipeline completed",
		"repository", job.Repository,
		"duration", time.Since(begin))
	return result, nil
}
// ensureDir creates path and any missing parents with 0o755 permissions.
// It is a no-op (nil error) when the directory already exists.
// Uses the 0o octal form for consistency with the rest of this file.
func ensureDir(path string) error {
	return os.MkdirAll(path, 0o755)
}