// Codewalkers/apps/server/agent/lifecycle/error-analyzer.ts

/**
 * ErrorAnalyzer — Intelligent error classification and handling strategies.
 *
 * Analyzes various error conditions from agent processes and classifies them
 * for appropriate retry and recovery strategies. Replaces scattered error
 * handling with centralized, comprehensive error analysis.
 */

import { createModuleLogger } from '../../logger/index.js';
import type { SignalManager } from './signal-manager.js';
import type { AgentError, AgentErrorType } from './retry-policy.js';

// Module-level logger, obtained from createModuleLogger with the module name 'error-analyzer'.
const log = createModuleLogger('error-analyzer');

/**
 * Regular expressions used to classify agent failures from free-form text
 * (error message plus stderr), grouped by error category.
 *
 * A category matches when ANY of its patterns matches. None of the patterns
 * carries the `g` or `y` flag, so `RegExp.prototype.test` is stateless and the
 * same instances can safely be reused across calls.
 *
 * The numeric HTTP status patterns (401 / 429) must not be flanked by other
 * digits; otherwise unrelated numbers such as byte counts, PIDs, ports or
 * timestamps (e.g. "14013", "4290") would be misread as auth or rate-limit
 * failures.
 */
const ERROR_PATTERNS = {
  // Credentials rejected, missing, or expired.
  auth_failure: [
    /unauthorized/i,
    /invalid.*(token|key|credential)/i,
    /authentication.*failed/i,
    /(?<!\d)401(?!\d)/, // HTTP 401 as a standalone number
    /access.*denied/i,
    /invalid.*session/i,
    /expired.*token/i,
  ],
  // Rate limiting or quota/credit exhaustion.
  usage_limit: [
    /rate.*(limit|exceeded)/i,
    /quota.*exceeded/i,
    /too.*many.*requests/i,
    /(?<!\d)429(?!\d)/, // HTTP 429 as a standalone number
    /usage.*limit/i,
    /throttled/i,
    /credit.*insufficient/i,
    /api.*limit.*reached/i,
  ],
  // Requests or connections that exceeded a time budget.
  timeout: [
    /timeout/i,
    /timed.*out/i,
    /deadline.*exceeded/i,
    /connection.*timeout/i,
    /read.*timeout/i,
  ],
  // Abnormal process termination reported in text form.
  process_crash: [
    /segmentation.*fault/i,
    /core.*dumped/i,
    /fatal.*error/i,
    /killed/i,
    /aborted/i,
  ],
};

/**
 * Classifies failures of agent processes into an {@link AgentError} so callers
 * can decide on retry / account-switch / persistence behaviour.
 *
 * Classification uses the error message, stderr text, the process exit code,
 * and (for clean exits) whether the signal manager reports that a signal
 * exists in the agent workdir.
 */
export class AgentErrorAnalyzer {
  /** Exit codes that are treated as retryable regardless of stderr content. */
  private static readonly TRANSIENT_EXIT_CODES: ReadonlySet<number> = new Set([
    130, // SIGINT (interrupted)
    143, // SIGTERM (terminated)
    124, // timeout command
    1,   // Generic error (might be transient)
  ]);

  /** stderr fragments that mark a crash as retryable. */
  private static readonly TRANSIENT_STDERR_PATTERNS: readonly RegExp[] = [
    /temporary/i,
    /network.*error/i,
    /connection.*refused/i,
    /service.*unavailable/i,
    /disk.*full/i,
    /out.*of.*memory/i,
  ];

  constructor(private readonly signalManager: SignalManager) {}

  /**
   * Analyze an error and classify it for retry strategy.
   *
   * Checks run in a fixed priority order and the first match wins:
   *   1. auth failure patterns
   *   2. usage limit patterns
   *   3. timeout patterns
   *   4. exit code 0 with a workdir but no signal reported by the signal manager
   *   5. crash patterns, or any non-zero exit code
   *   6. otherwise `unknown`
   *
   * Pattern checks are applied to the error message and stderr joined together.
   *
   * @param error        The thrown error or a plain message string.
   * @param exitCode     Process exit code, if the process exited; `null`/`undefined` when unavailable.
   * @param stderr       Captured stderr output, if any.
   * @param agentWorkdir Agent working directory; required for the missing-signal check.
   * @returns The classified error. `originalError` is set only when `error` is an `Error` instance.
   */
  async analyzeError(
    error: Error | string,
    exitCode?: number | null,
    stderr?: string,
    agentWorkdir?: string
  ): Promise<AgentError> {
    const errorMessage = error instanceof Error ? error.message : String(error);
    const originalError = error instanceof Error ? error : undefined;
    // Empty/undefined parts are dropped so the context never contains "undefined".
    const fullContext = [errorMessage, stderr].filter(Boolean).join(' ');

    log.debug({
      errorMessage,
      exitCode,
      hasStderr: !!stderr,
      hasWorkdir: !!agentWorkdir
    }, 'analyzing agent error');

    // Check for auth failure patterns
    if (this.matchesPattern(fullContext, ERROR_PATTERNS.auth_failure)) {
      return {
        type: 'auth_failure',
        message: errorMessage,
        isTransient: true,
        requiresAccountSwitch: false,
        shouldPersistToDB: true,
        exitCode,
        originalError,
      };
    }

    // Check for usage limit patterns
    if (this.matchesPattern(fullContext, ERROR_PATTERNS.usage_limit)) {
      return {
        type: 'usage_limit',
        message: errorMessage,
        isTransient: false,
        requiresAccountSwitch: true,
        shouldPersistToDB: true,
        exitCode,
        originalError,
      };
    }

    // Check for timeout patterns
    if (this.matchesPattern(fullContext, ERROR_PATTERNS.timeout)) {
      return {
        type: 'timeout',
        message: errorMessage,
        isTransient: true,
        requiresAccountSwitch: false,
        shouldPersistToDB: true,
        exitCode,
        originalError,
      };
    }

    // Special case: process completed successfully but no signal.json
    if (agentWorkdir && exitCode === 0) {
      const hasSignal = await this.signalManager.checkSignalExists(agentWorkdir);
      if (!hasSignal) {
        log.debug({ agentWorkdir }, 'process completed successfully but no signal.json found');
        return {
          type: 'missing_signal',
          message: 'Process completed successfully but no signal.json was generated',
          isTransient: true,
          requiresAccountSwitch: false,
          shouldPersistToDB: false,
          exitCode,
          originalError,
        };
      }
    }

    // Check for process crash patterns; any known non-zero exit code also counts as a crash.
    const hasFailureExitCode =
      exitCode !== null && exitCode !== undefined && exitCode !== 0;
    if (this.matchesPattern(fullContext, ERROR_PATTERNS.process_crash) || hasFailureExitCode) {
      // Determine if crash is transient based on exit code and patterns
      const isTransient = this.isTransientCrash(exitCode, stderr);

      return {
        type: 'process_crash',
        message: errorMessage,
        isTransient,
        requiresAccountSwitch: false,
        shouldPersistToDB: true,
        exitCode,
        originalError,
      };
    }

    // Unknown error type
    log.debug({
      errorMessage,
      exitCode,
      stderr: this.truncateForLog(stderr, 200)
    }, 'error does not match known patterns, classifying as unknown');

    return {
      type: 'unknown',
      message: errorMessage,
      isTransient: false,
      requiresAccountSwitch: false,
      shouldPersistToDB: true,
      exitCode,
      originalError,
    };
  }

  /**
   * Validate credentials with a brief test request using invalid token.
   * This helps distinguish between token expiry vs. account exhaustion.
   *
   * NOTE: not implemented yet. No request is made; the call only emits a debug
   * log entry and always resolves to `true`.
   *
   * @param accountId Identifier of the account whose token should be checked.
   * @returns Always `true` in the current implementation.
   */
  async validateTokenWithInvalidRequest(accountId: string): Promise<boolean> {
    // User requirement: "brief check with invalid access token to determine behavior"
    // This would need integration with credential system and is provider-specific
    // For now, return true to indicate token appears valid
    log.debug({ accountId }, 'token validation requested (not yet implemented)');
    return true;
  }

  /**
   * Check if error message or stderr matches any of the given patterns.
   *
   * @param text     Text to scan; an empty string never matches.
   * @param patterns Candidate patterns; the first hit short-circuits.
   */
  private matchesPattern(text: string, patterns: RegExp[]): boolean {
    if (!text) return false;
    return patterns.some(pattern => pattern.test(text));
  }

  /**
   * Determine if a process crash is likely transient (can be retried).
   *
   * A crash is considered transient when the exit code is one of
   * {@link AgentErrorAnalyzer.TRANSIENT_EXIT_CODES}, lies in the 129–255 range
   * (the shell convention for "terminated by signal"), or stderr matches one of
   * {@link AgentErrorAnalyzer.TRANSIENT_STDERR_PATTERNS}.
   *
   * @param exitCode Process exit code, if known.
   * @param stderr   Captured stderr output, if any.
   */
  private isTransientCrash(exitCode?: number | null, stderr?: string): boolean {
    if (exitCode !== null && exitCode !== undefined) {
      if (AgentErrorAnalyzer.TRANSIENT_EXIT_CODES.has(exitCode)) {
        log.debug({ exitCode }, 'exit code indicates transient failure');
        return true;
      }

      // Very high exit codes often indicate system issues
      if (exitCode > 128 && exitCode < 256) {
        log.debug({ exitCode }, 'signal-based exit code may be transient');
        return true;
      }
    }

    // Check stderr for transient patterns
    if (stderr) {
      const looksTransient = AgentErrorAnalyzer.TRANSIENT_STDERR_PATTERNS.some(
        pattern => pattern.test(stderr)
      );
      if (looksTransient) {
        log.debug({ stderr: this.truncateForLog(stderr, 100) }, 'stderr indicates transient failure');
        return true;
      }
    }

    log.debug({ exitCode, hasStderr: !!stderr }, 'crash appears non-transient');
    return false;
  }

  /**
   * Shorten text for log output.
   *
   * Returns `undefined` for missing input (instead of the string "undefined...")
   * and appends an ellipsis only when the text was actually cut.
   */
  private truncateForLog(text: string | undefined, maxLength: number): string | undefined {
    if (text === undefined) return undefined;
    return text.length > maxLength ? `${text.substring(0, maxLength)}...` : text;
  }
}