Files
at-container-registry/pkg/auth/servicetoken.go

362 lines
13 KiB
Go

package auth
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"log/slog"
"net/http"
"net/url"
"time"
"atcr.io/pkg/atproto"
"atcr.io/pkg/auth/oauth"
"github.com/bluesky-social/indigo/atproto/atclient"
indigo_oauth "github.com/bluesky-social/indigo/atproto/auth/oauth"
)
// getErrorHint returns a troubleshooting hint for an API error, suitable for
// attaching to a log record. The hint is chosen from the error name, refined by
// the error message or HTTP status for some names; names without a dedicated
// hint fall back to a status-based message.
func getErrorHint(apiErr *atclient.APIError) string {
	// The one invalid_client message that is reported as a clock-skew problem
	// rather than as a generic client authentication failure.
	const iatInFutureMsg = "Validation of \"client_assertion\" failed: \"iat\" claim timestamp check failed (it should be in the past)"

	name, status := apiErr.Name, apiErr.StatusCode

	// Cases are ordered from most to least specific; the first match wins.
	switch {
	case name == "use_dpop_nonce":
		return "DPoP nonce mismatch - indigo library should automatically retry with new nonce. If this persists, check for concurrent request issues or PDS session corruption."
	case name == "invalid_client" && apiErr.Message == iatInFutureMsg:
		return "JWT timestamp validation failed - system clock on AppView may be ahead of PDS clock. Check NTP sync with: timedatectl status"
	case name == "invalid_client":
		return "OAuth client authentication failed - check client key configuration and PDS OAuth server status"
	case name == "invalid_token", name == "invalid_grant":
		return "OAuth tokens expired or invalidated - user will need to re-authenticate via OAuth flow"
	case name == "server_error" && status == 500:
		return "PDS returned internal server error - this may occur after repeated DPoP nonce failures or other PDS-side issues. Check PDS logs for root cause."
	case name == "server_error":
		return "PDS server error - check PDS health and logs"
	case name == "invalid_dpop_proof":
		return "DPoP proof validation failed - check system clock sync and DPoP key configuration"
	case status == 401, status == 403:
		return "Authentication/authorization failed - OAuth session may be expired or revoked"
	default:
		return "PDS rejected the request - see errorName and errorMessage for details"
	}
}
// GetOrFetchServiceToken gets a service token for hold authentication.
// It checks the cache first, then fetches from the PDS with OAuth/DPoP if needed.
// This is the canonical implementation used by both middleware and crew registration.
//
// did is the user the token is issued for, holdDID is the audience of the token
// and pdsEndpoint is the base URL of the user's PDS. A cached token is reused
// only while it has more than 10 seconds of validity left. On success the token
// is returned (and cached on a best-effort basis); on failure the cached entry
// for (did, holdDID) is invalidated and a non-nil error is returned together
// with an empty token.
//
// IMPORTANT: Uses DoWithSession() to hold a per-DID lock through the entire PDS interaction.
// This prevents DPoP nonce race conditions when multiple Docker layers upload concurrently.
//
// When the failure is not merely a cancelled or timed-out request, the stored
// OAuth session is deleted so that the user is forced to re-authenticate.
func GetOrFetchServiceToken(
	ctx context.Context,
	refresher *oauth.Refresher,
	did, holdDID, pdsEndpoint string,
) (string, error) {
	// Upper bound on how much of a PDS response body is read. The PDS is a
	// remote party, so neither the diagnostic read of an error body nor the
	// JSON decode of the token response may consume unbounded memory.
	const maxPDSResponseBytes = 64 << 10

	if refresher == nil {
		return "", fmt.Errorf("refresher is nil (OAuth session required for service tokens)")
	}

	// Check cache first to avoid unnecessary PDS calls on every request
	cachedToken, expiresAt := GetServiceToken(did, holdDID)

	// Use cached token if it exists and has > 10s remaining
	if cachedToken != "" && time.Until(expiresAt) > 10*time.Second {
		slog.Debug("Using cached service token",
			"did", did,
			"expiresIn", time.Until(expiresAt).Round(time.Second))
		return cachedToken, nil
	}

	// Cache miss or expiring soon - validate OAuth and get new service token
	if cachedToken == "" {
		slog.Debug("Service token cache miss, fetching new token", "did", did)
	} else {
		slog.Debug("Service token expiring soon, proactively renewing", "did", did)
	}

	// Use DoWithSession to hold the lock through the entire PDS interaction.
	// This prevents DPoP nonce races when multiple goroutines try to fetch service tokens.
	var serviceToken string
	// fetchErr is set by the callback on every failure path so that the code
	// below can tell a callback failure apart from a session acquisition failure.
	var fetchErr error
	err := refresher.DoWithSession(ctx, did, func(session *indigo_oauth.ClientSession) error {
		// Re-check the cache after acquiring the lock - another goroutine may
		// have populated it while we were waiting.
		cachedToken, expiresAt := GetServiceToken(did, holdDID)
		if cachedToken != "" && time.Until(expiresAt) > 10*time.Second {
			slog.Debug("Service token cache hit after lock acquisition",
				"did", did,
				"expiresIn", time.Until(expiresAt).Round(time.Second))
			serviceToken = cachedToken
			return nil
		}

		// Cache still empty/expired - proceed with PDS call
		// Request 5-minute expiry (PDS may grant less)
		// exp must be absolute Unix timestamp, not relative duration
		// Note: OAuth scope includes #atcr_hold fragment, but service auth aud must be bare DID
		expiryTime := time.Now().Unix() + 300 // 5 minutes from now
		serviceAuthURL := fmt.Sprintf("%s%s?aud=%s&lxm=%s&exp=%d",
			pdsEndpoint,
			atproto.ServerGetServiceAuth,
			url.QueryEscape(holdDID),
			url.QueryEscape("com.atproto.repo.getRecord"),
			expiryTime,
		)

		req, err := http.NewRequestWithContext(ctx, "GET", serviceAuthURL, nil)
		if err != nil {
			fetchErr = fmt.Errorf("failed to create service auth request: %w", err)
			return fetchErr
		}

		// Use OAuth session to authenticate to PDS (with DPoP)
		// The lock is held, so DPoP nonce negotiation is serialized per-DID
		resp, err := session.DoWithAuth(session.Client, req, "com.atproto.server.getServiceAuth")
		if err != nil {
			// Auth error - may indicate expired tokens or corrupted session
			InvalidateServiceToken(did, holdDID)

			// Inspect the error to extract detailed information from indigo's APIError
			var apiErr *atclient.APIError
			if errors.As(err, &apiErr) {
				// Log detailed API error information
				slog.Error("OAuth authentication failed during service token request",
					"component", "token/servicetoken",
					"did", did,
					"holdDID", holdDID,
					"pdsEndpoint", pdsEndpoint,
					"url", serviceAuthURL,
					"error", err,
					"httpStatus", apiErr.StatusCode,
					"errorName", apiErr.Name,
					"errorMessage", apiErr.Message,
					"hint", getErrorHint(apiErr))
			} else {
				// Fallback for non-API errors (network errors, etc.)
				slog.Error("OAuth authentication failed during service token request",
					"component", "token/servicetoken",
					"did", did,
					"holdDID", holdDID,
					"pdsEndpoint", pdsEndpoint,
					"url", serviceAuthURL,
					"error", err,
					"errorType", fmt.Sprintf("%T", err),
					"hint", "Network error or unexpected failure during OAuth request")
			}

			fetchErr = fmt.Errorf("OAuth validation failed: %w", err)
			return fetchErr
		}
		defer resp.Body.Close()

		if resp.StatusCode != http.StatusOK {
			// Service auth failed. The body is read only for diagnostics, so a
			// read error is deliberately ignored and the read is size-capped.
			bodyBytes, _ := io.ReadAll(io.LimitReader(resp.Body, maxPDSResponseBytes))
			InvalidateServiceToken(did, holdDID)
			slog.Error("Service token request returned non-200 status",
				"component", "token/servicetoken",
				"did", did,
				"holdDID", holdDID,
				"pdsEndpoint", pdsEndpoint,
				"statusCode", resp.StatusCode,
				"responseBody", string(bodyBytes),
				"hint", "PDS rejected the service token request - check PDS logs for details")
			fetchErr = fmt.Errorf("service auth failed with status %d: %s", resp.StatusCode, string(bodyBytes))
			return fetchErr
		}

		// Parse response to get service token
		var result struct {
			Token string `json:"token"`
		}
		if err := json.NewDecoder(io.LimitReader(resp.Body, maxPDSResponseBytes)).Decode(&result); err != nil {
			fetchErr = fmt.Errorf("failed to decode service auth response: %w", err)
			return fetchErr
		}

		if result.Token == "" {
			fetchErr = fmt.Errorf("empty token in service auth response")
			return fetchErr
		}

		serviceToken = result.Token
		return nil
	})

	if err != nil {
		// DoWithSession failed (session load or callback error)
		InvalidateServiceToken(did, holdDID)

		// Only session acquisition failures are logged here. Callback failures
		// (fetchErr != nil) were either logged inside the callback or are
		// returned to the caller unchanged; logging them again would duplicate
		// the record under a misleading "failed to get session" message.
		if fetchErr == nil {
			var apiErr *atclient.APIError
			if errors.As(err, &apiErr) {
				slog.Error("Failed to get OAuth session for service token",
					"component", "token/servicetoken",
					"did", did,
					"holdDID", holdDID,
					"pdsEndpoint", pdsEndpoint,
					"error", err,
					"httpStatus", apiErr.StatusCode,
					"errorName", apiErr.Name,
					"errorMessage", apiErr.Message,
					"hint", getErrorHint(apiErr))
			} else {
				slog.Error("Failed to get OAuth session for service token",
					"component", "token/servicetoken",
					"did", did,
					"holdDID", holdDID,
					"pdsEndpoint", pdsEndpoint,
					"error", err,
					"errorType", fmt.Sprintf("%T", err),
					"hint", "OAuth session not found in database or token refresh failed")
			}
		}

		if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
			// The request was aborted or timed out. That says nothing about
			// the validity of the stored session, so keep it instead of
			// forcing the user through the OAuth flow again.
			slog.Debug("Keeping OAuth session after cancelled or timed-out service token request",
				"component", "token/servicetoken",
				"did", did)
		} else if delErr := refresher.DeleteSession(ctx, did); delErr != nil {
			// Delete the stale OAuth session to force re-authentication
			// This also invalidates the UI session automatically
			slog.Warn("Failed to delete stale OAuth session",
				"component", "token/servicetoken",
				"did", did,
				"error", delErr)
		}

		if fetchErr != nil {
			return "", fetchErr
		}
		return "", fmt.Errorf("failed to get OAuth session: %w", err)
	}

	// Cache the token (parses JWT to extract actual expiry)
	if err := SetServiceToken(did, holdDID, serviceToken); err != nil {
		slog.Warn("Failed to cache service token", "error", err, "did", did, "holdDID", holdDID)
		// Non-fatal - we have the token, just won't be cached
	}

	slog.Debug("OAuth validation succeeded, service token obtained", "did", did)
	return serviceToken, nil
}
// GetOrFetchServiceTokenWithAppPassword gets a service token using app-password Bearer authentication.
// Used when auth method is app_password instead of OAuth.
//
// did is the user the token is issued for, holdDID is the audience of the token
// and pdsEndpoint is the base URL of the user's PDS. A cached service token is
// reused only while it has more than 10 seconds of validity left. Otherwise the
// app-password access token for did is taken from the global token cache and
// used as a Bearer credential for com.atproto.server.getServiceAuth.
//
// On failure the cached service token for (did, holdDID) is invalidated and an
// empty token is returned with a non-nil error. If the PDS answers 401, the
// app-password access token is removed from the cache as well.
func GetOrFetchServiceTokenWithAppPassword(
	ctx context.Context,
	did, holdDID, pdsEndpoint string,
) (string, error) {
	const (
		// http.DefaultClient has no timeout of its own, so the PDS round trip
		// is bounded here in case the caller's context carries no deadline.
		serviceAuthTimeout = 30 * time.Second
		// Upper bound on how much of a PDS response body is read; the PDS is
		// a remote party and must not be able to make us buffer without limit.
		maxPDSResponseBytes = 64 << 10
	)

	// Check cache first to avoid unnecessary PDS calls on every request
	cachedToken, expiresAt := GetServiceToken(did, holdDID)

	// Use cached token if it exists and has > 10s remaining
	if cachedToken != "" && time.Until(expiresAt) > 10*time.Second {
		slog.Debug("Using cached service token (app-password)",
			"did", did,
			"expiresIn", time.Until(expiresAt).Round(time.Second))
		return cachedToken, nil
	}

	// Cache miss or expiring soon - get app-password token and fetch new service token
	if cachedToken == "" {
		slog.Debug("Service token cache miss, fetching new token with app-password", "did", did)
	} else {
		slog.Debug("Service token expiring soon, proactively renewing with app-password", "did", did)
	}

	// Get app-password access token from cache
	accessToken, ok := GetGlobalTokenCache().Get(did)
	if !ok {
		InvalidateServiceToken(did, holdDID)
		slog.Error("No app-password access token found in cache",
			"component", "token/servicetoken",
			"did", did,
			"holdDID", holdDID,
			"hint", "User must re-authenticate with docker login")
		return "", fmt.Errorf("no app-password access token available for DID %s", did)
	}

	// Call com.atproto.server.getServiceAuth on the user's PDS with Bearer token
	// Request 5-minute expiry (PDS may grant less)
	// exp must be absolute Unix timestamp, not relative duration
	expiryTime := time.Now().Unix() + 300 // 5 minutes from now
	serviceAuthURL := fmt.Sprintf("%s%s?aud=%s&lxm=%s&exp=%d",
		pdsEndpoint,
		atproto.ServerGetServiceAuth,
		url.QueryEscape(holdDID),
		url.QueryEscape("com.atproto.repo.getRecord"),
		expiryTime,
	)

	// The derived context covers the request and the reading of the response
	// body; it is released when this function returns.
	reqCtx, cancel := context.WithTimeout(ctx, serviceAuthTimeout)
	defer cancel()

	req, err := http.NewRequestWithContext(reqCtx, "GET", serviceAuthURL, nil)
	if err != nil {
		return "", fmt.Errorf("failed to create service auth request: %w", err)
	}

	// Set Bearer token authentication (app-password)
	req.Header.Set("Authorization", "Bearer "+accessToken)

	// Make request with standard HTTP client
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		InvalidateServiceToken(did, holdDID)
		slog.Error("App-password service token request failed",
			"component", "token/servicetoken",
			"did", did,
			"holdDID", holdDID,
			"pdsEndpoint", pdsEndpoint,
			"error", err)
		return "", fmt.Errorf("failed to request service token: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusUnauthorized {
		// App-password token is invalid or expired - clear from cache
		GetGlobalTokenCache().Delete(did)
		InvalidateServiceToken(did, holdDID)
		slog.Error("App-password token rejected by PDS",
			"component", "token/servicetoken",
			"did", did,
			"hint", "User must re-authenticate with docker login")
		return "", fmt.Errorf("app-password authentication failed: token expired or invalid")
	}

	if resp.StatusCode != http.StatusOK {
		// Service auth failed. The body is read only for diagnostics, so a
		// read error is deliberately ignored and the read is size-capped.
		bodyBytes, _ := io.ReadAll(io.LimitReader(resp.Body, maxPDSResponseBytes))
		InvalidateServiceToken(did, holdDID)
		slog.Error("Service token request returned non-200 status (app-password)",
			"component", "token/servicetoken",
			"did", did,
			"holdDID", holdDID,
			"pdsEndpoint", pdsEndpoint,
			"statusCode", resp.StatusCode,
			"responseBody", string(bodyBytes))
		return "", fmt.Errorf("service auth failed with status %d: %s", resp.StatusCode, string(bodyBytes))
	}

	// Parse response to get service token
	var result struct {
		Token string `json:"token"`
	}
	if err := json.NewDecoder(io.LimitReader(resp.Body, maxPDSResponseBytes)).Decode(&result); err != nil {
		return "", fmt.Errorf("failed to decode service auth response: %w", err)
	}

	if result.Token == "" {
		return "", fmt.Errorf("empty token in service auth response")
	}

	serviceToken := result.Token

	// Cache the token (parses JWT to extract actual expiry)
	if err := SetServiceToken(did, holdDID, serviceToken); err != nil {
		slog.Warn("Failed to cache service token", "error", err, "did", did, "holdDID", holdDID)
		// Non-fatal - we have the token, just won't be cached
	}

	slog.Debug("App-password validation succeeded, service token obtained", "did", did)
	return serviceToken, nil
}