Codewalkers/apps/server/agent/lifecycle/controller.ts

/**
 * AgentLifecycleController — Unified orchestrator for complete agent lifecycle.
 *
 * Replaces scattered lifecycle logic with comprehensive orchestration including:
 * - Always clear signal.json before spawn/resume
 * - Robust process completion waiting
 * - Retry up to `retryPolicy.maxAttempts` times with comprehensive error handling
 * - Auth/usage limit error detection with account switching
 * - Missing signal detection (retrying with an instruction prompt is not yet implemented)
 * - Debug mode archival vs production cleanup
 */

import { createModuleLogger } from '../../logger/index.js';
import type { AgentRepository } from '../../db/repositories/agent-repository.js';
import type { AccountRepository } from '../../db/repositories/account-repository.js';
import type { ProcessManager } from '../process-manager.js';
import type { CleanupManager } from '../cleanup-manager.js';
import type { SpawnAgentOptions } from '../types.js';
import type { SignalManager, SignalData } from './signal-manager.js';
import type { RetryPolicy, AgentError } from './retry-policy.js';
import { AgentExhaustedError, AgentFailureError } from './retry-policy.js';
import type { AgentErrorAnalyzer } from './error-analyzer.js';
import type { CleanupStrategy, AgentInfo } from './cleanup-strategy.js';
import type { EventBus, AgentAccountSwitchedEvent } from '../../events/types.js';

// Module-scoped logger; every entry from this file is tagged 'lifecycle-controller'.
const log = createModuleLogger('lifecycle-controller');

/**
 * Outcome of waiting for an agent run to finish, as produced by
 * `AgentLifecycleController.waitForCompletion` and consumed by the retry loop.
 */
export interface CompletionResult {
  /** True when a signal was obtained for the run; false otherwise. */
  success: boolean;
  /** The signal that was read; set on the success path. */
  signal?: SignalData;
  /** Failure cause; the retry loop substitutes a generic Error when this is absent. */
  error?: Error;
  /** Process exit code forwarded to the error analyzer; currently always `null` on failure. */
  exitCode?: number | null;
  /** Captured stderr forwarded to the error analyzer; not populated by this file. */
  stderr?: string;
}

/**
 * Arguments for `AgentLifecycleController.resumeWithRetry`.
 */
export interface ResumeAgentOptions {
  /** Id of the agent to resume; also used to re-read the agent from the repository afterwards. */
  agentId: string;
  /** Answers handed to the resume function unchanged; only the keys are logged. */
  answers: Record<string, string>;
}

/**
 * Orchestrates a single agent run (spawn or resume) end to end: starts the
 * operation, clears `signal.json`, waits for a signal, and on failure analyzes
 * the error, optionally persists it, handles account exhaustion, and retries
 * according to the injected {@link RetryPolicy}.
 */
export class AgentLifecycleController {
  /** How long to wait for a signal before treating the run as failed (ms). */
  private static readonly SIGNAL_WAIT_TIMEOUT_MS = 30000;

  /** How long an account stays marked as exhausted after a usage-limit error (ms). */
  private static readonly ACCOUNT_EXHAUSTION_MS = 60 * 60 * 1000;

  /**
   * @param signalManager Clears and waits for the agent's signal file.
   * @param retryPolicy Supplies `maxAttempts`, `shouldRetry` and `getRetryDelay`.
   * @param errorAnalyzer Classifies a failed run into an {@link AgentError}.
   * @param processManager Resolves an agent's working directory from its worktree id.
   * @param repository Agent persistence (lookup and update).
   * @param cleanupManager Not referenced by this class; kept for constructor compatibility.
   * @param cleanupStrategy Decides on and executes post-completion cleanup.
   * @param accountRepository Optional; without it account exhaustion handling is skipped.
   * @param debug Forwarded to `cleanupStrategy.shouldCleanup`.
   * @param eventBus Optional; without it no `agent:account_switched` event is emitted.
   */
  constructor(
    private readonly signalManager: SignalManager,
    private readonly retryPolicy: RetryPolicy,
    private readonly errorAnalyzer: AgentErrorAnalyzer,
    private readonly processManager: ProcessManager,
    private readonly repository: AgentRepository,
    private readonly cleanupManager: CleanupManager,
    private readonly cleanupStrategy: CleanupStrategy,
    private readonly accountRepository?: AccountRepository,
    private readonly debug: boolean = false,
    private readonly eventBus?: EventBus,
  ) {}

  /**
   * Run a spawn operation through the retry loop.
   *
   * @param spawnFn Callback that performs the spawn and returns the agent's info.
   * @param options Spawn options, passed to `spawnFn` unchanged on every attempt.
   * @returns The `AgentInfo` returned by `spawnFn` on the successful attempt.
   * @throws AgentExhaustedError When the analyzed error requires an account switch.
   * @throws AgentFailureError When the error is not retriable or attempts are used up.
   * @throws The last error thrown by `spawnFn` (or a lifecycle step) on the final attempt.
   */
  async spawnWithRetry(
    spawnFn: (options: SpawnAgentOptions) => Promise<AgentInfo>,
    options: SpawnAgentOptions
  ): Promise<AgentInfo> {
    log.info({
      taskId: options.taskId,
      provider: options.provider,
      initiativeId: options.initiativeId,
      mode: options.mode
    }, 'starting agent spawn with retry');

    return this.executeWithRetry('spawn', spawnFn, options);
  }

  /**
   * Run a resume operation through the retry loop.
   *
   * Each attempt calls `resumeFn` and then re-reads the agent from the
   * repository so the retry loop has an `AgentInfo` to work with.
   *
   * @param resumeFn Callback that performs the resume.
   * @param options Agent id and the answers to hand to `resumeFn`.
   * @throws AgentExhaustedError When the analyzed error requires an account switch.
   * @throws AgentFailureError When the error is not retriable or attempts are used up.
   * @throws Error When the agent cannot be found after the final resume attempt.
   */
  async resumeWithRetry(
    resumeFn: (agentId: string, answers: Record<string, string>) => Promise<void>,
    options: ResumeAgentOptions
  ): Promise<void> {
    log.info({
      agentId: options.agentId,
      answerKeys: Object.keys(options.answers)
    }, 'starting agent resume with retry');

    await this.executeWithRetry('resume', async () => {
      await resumeFn(options.agentId, options.answers);
      const agent = await this.repository.findById(options.agentId);
      if (!agent) throw new Error(`Agent '${options.agentId}' not found after resume`);
      return this.toAgentInfo(agent);
    }, options);
  }

  /**
   * Main retry orchestrator for spawn/resume operations.
   *
   * Per attempt: run the operation, clear the signal file, wait for a signal.
   * On success, run post-completion cleanup and return. On a failed completion,
   * analyze the error and either throw a terminal error or back off and retry.
   * Errors *thrown* by any step are retried immediately (no backoff) until the
   * final attempt, where they are rethrown.
   *
   * @param operation Label used for logging only.
   * @param operationFn The spawn/resume callback; invoked once per attempt.
   * @param options Passed to `operationFn` unchanged.
   * @returns The `AgentInfo` from the attempt that completed successfully.
   */
  private async executeWithRetry<T>(
    operation: 'spawn' | 'resume',
    operationFn: (options: T) => Promise<AgentInfo>,
    options: T
  ): Promise<AgentInfo> {

    for (let attempt = 1; attempt <= this.retryPolicy.maxAttempts; attempt++) {
      try {
        log.debug({ operation, attempt, maxAttempts: this.retryPolicy.maxAttempts }, 'starting attempt');

        // Execute operation
        const agent = await operationFn(options);
        const agentWorkdir = this.processManager.getAgentWorkdir(agent.worktreeId);

        // Drop any stale signal.json so the wait below only sees a fresh one.
        // NOTE(review): this runs after operationFn has resolved; if operationFn
        // already started the process, a very fast agent could write its signal
        // before this clear — confirm against the spawn/resume implementations.
        log.debug({ agentId: agent.id, agentWorkdir }, 'clearing signal.json before process start');
        await this.signalManager.clearSignal(agentWorkdir);

        // Wait for process completion with robust detection
        const result = await this.waitForCompletion(agent);

        if (result.success) {
          // Handle post-completion cleanup
          await this.handlePostCompletion(agent);
          log.info({
            agentId: agent.id,
            name: agent.name,
            attempt,
            operation
          }, 'agent lifecycle completed successfully');
          return agent;
        }

        // Analyze error and determine retry strategy
        const agentError = await this.errorAnalyzer.analyzeError(
          result.error || new Error('Unknown completion failure'),
          result.exitCode,
          result.stderr,
          agentWorkdir
        );

        // Persist error to DB if required
        if (agentError.shouldPersistToDB) {
          await this.persistError(agent.id, agentError);
        }

        // Usage-limit style errors are terminal for this run: mark the account
        // and let the caller decide how to proceed with a different one.
        if (agentError.requiresAccountSwitch) {
          await this.handleAccountExhaustion(agent.id);
          throw new AgentExhaustedError(agentError.message, agentError);
        }

        // The final attempt is always terminal, even if the policy would allow
        // another retry: otherwise we would sleep for a backoff that leads
        // nowhere and then lose the analyzed error to the generic fallthrough.
        const retryAllowed = this.retryPolicy.shouldRetry(agentError, attempt);
        const isFinalAttempt = attempt >= this.retryPolicy.maxAttempts;
        if (!retryAllowed || isFinalAttempt) {
          log.warn({
            agentId: agent.id,
            errorType: agentError.type,
            attempt,
            maxAttempts: this.retryPolicy.maxAttempts
          }, 'max retry attempts reached or error not retriable');
          throw new AgentFailureError(agentError.message, agentError);
        }

        // Handle special retry cases
        if (agentError.type === 'missing_signal') {
          // This would need to modify the options to add instruction prompt
          // For now, log the special case
          log.info({
            agentId: agent.id,
            attempt
          }, 'will retry with missing signal instruction (not yet implemented)');
        }

        // Wait before retry
        const delay = this.retryPolicy.getRetryDelay(attempt);
        log.info({
          agentId: agent.id,
          attempt,
          delay,
          errorType: agentError.type,
          errorMessage: agentError.message
        }, 'retrying after delay');
        await this.delay(delay);

      } catch (error) {
        // Terminal errors raised above must escape the retry loop untouched.
        if (error instanceof AgentExhaustedError || error instanceof AgentFailureError) {
          throw error; // Don't retry these
        }

        if (attempt === this.retryPolicy.maxAttempts) {
          log.error({
            operation,
            attempt,
            error: error instanceof Error ? error.message : String(error)
          }, 'final attempt failed, giving up');
          throw error;
        }

        log.warn({
          operation,
          attempt,
          error: error instanceof Error ? error.message : String(error)
        }, 'attempt failed, will retry');
      }
    }

    // Only reachable when the loop body never ran (e.g. maxAttempts < 1).
    throw new Error('Unexpected: retry loop completed without success or terminal error');
  }

  /**
   * Wait for the agent's signal and translate the outcome into a
   * {@link CompletionResult}.
   *
   * @param agent Agent whose working directory is watched for a signal.
   * @returns `success: true` with the signal when one arrives within the
   *   timeout; otherwise `success: false` with an error and a `null` exit code.
   */
  private async waitForCompletion(agent: AgentInfo): Promise<CompletionResult> {
    const agentWorkdir = this.processManager.getAgentWorkdir(agent.worktreeId);

    log.debug({
      agentId: agent.id,
      name: agent.name,
      agentWorkdir
    }, 'waiting for process completion');

    // Wait for process to exit (this would need integration with ProcessManager)
    // For now, simulate with a timeout approach
    // TODO: Implement waitForProcessCompletion in ProcessManager

    // Wait for signal within a bounded timeout
    const signal = await this.signalManager.waitForSignal(
      agentWorkdir,
      AgentLifecycleController.SIGNAL_WAIT_TIMEOUT_MS
    );

    if (signal) {
      log.debug({
        agentId: agent.id,
        signalStatus: signal.status
      }, 'agent completed with valid signal');
      return { success: true, signal };
    }

    // No signal found - this is an error condition
    log.warn({
      agentId: agent.id,
      agentWorkdir
    }, 'process completed without valid signal.json');

    return {
      success: false,
      error: new Error('Process completed without valid signal.json'),
      exitCode: null // Would get from ProcessManager
    };
  }

  /**
   * Run post-completion cleanup unless the agent is waiting for user input.
   * Cleanup failures are logged and swallowed so they never fail the run.
   *
   * NOTE(review): `agent` is the object returned by the spawn/resume operation
   * before completion was awaited, so `agent.status` may not reflect the
   * status after the run — confirm whether it should be re-read first.
   *
   * @param agent Agent to clean up after.
   */
  private async handlePostCompletion(agent: AgentInfo): Promise<void> {
    // Only cleanup if agent is not waiting for user input
    if (agent.status === 'waiting_for_input') {
      log.debug({ agentId: agent.id }, 'agent waiting for input, skipping cleanup');
      return;
    }

    try {
      const cleanupAction = await this.cleanupStrategy.shouldCleanup(agent, this.debug);
      await this.cleanupStrategy.executeCleanup(agent, cleanupAction);

      log.debug({
        agentId: agent.id,
        name: agent.name,
        cleanupAction
      }, 'post-completion cleanup executed');
    } catch (error) {
      log.warn({
        agentId: agent.id,
        error: error instanceof Error ? error.message : String(error)
      }, 'post-completion cleanup failed');
    }
  }

  /**
   * Record a failed run on the agent row. Only `exitCode` and `updatedAt` are
   * written; storing the error type/message would need schema changes.
   * Database failures are logged and swallowed (best effort).
   *
   * @param agentId Agent whose record is updated.
   * @param error Analyzed error providing the exit code.
   */
  private async persistError(agentId: string, error: AgentError): Promise<void> {
    try {
      // This would need database schema updates to store error details
      // For now, just update with basic error info
      await this.repository.update(agentId, {
        exitCode: error.exitCode,
        updatedAt: new Date(),
      });

      log.debug({
        agentId,
        errorType: error.type,
        exitCode: error.exitCode
      }, 'error details persisted to database');
    } catch (dbError) {
      log.warn({
        agentId,
        error: dbError instanceof Error ? dbError.message : String(dbError)
      }, 'failed to persist error to database');
    }
  }

  /**
   * Mark the agent's account as exhausted and, if another account is
   * available and an event bus is configured, emit `agent:account_switched`.
   *
   * No-op when there is no account repository or the agent has no account.
   * Any failure is logged and swallowed (best effort).
   *
   * @param agentId Agent whose account hit a usage limit.
   */
  private async handleAccountExhaustion(agentId: string): Promise<void> {
    if (!this.accountRepository) {
      log.debug({ agentId }, 'no account repository available for exhaustion handling');
      return;
    }

    try {
      const agent = await this.repository.findById(agentId);
      if (!agent?.accountId) {
        log.debug({ agentId }, 'agent has no account ID for exhaustion handling');
        return;
      }

      const previousAccountId = agent.accountId;

      // Mark account as exhausted for the configured window
      const exhaustedUntil = new Date(Date.now() + AgentLifecycleController.ACCOUNT_EXHAUSTION_MS);
      await this.accountRepository.markExhausted(previousAccountId, exhaustedUntil);

      log.info({
        agentId,
        accountId: previousAccountId,
        exhaustedUntil
      }, 'marked account as exhausted due to usage limits');

      // Find the next available account and emit account_switched event.
      // Agents without a provider are looked up under 'claude'.
      const newAccount = await this.accountRepository.findNextAvailable(agent.provider ?? 'claude');
      if (newAccount && this.eventBus) {
        const event: AgentAccountSwitchedEvent = {
          type: 'agent:account_switched',
          timestamp: new Date(),
          payload: {
            agentId,
            name: agent.name,
            previousAccountId,
            newAccountId: newAccount.id,
            reason: 'account_exhausted',
          },
        };
        this.eventBus.emit(event);
      }
    } catch (error) {
      log.warn({
        agentId,
        error: error instanceof Error ? error.message : String(error)
      }, 'failed to mark account as exhausted');
    }
  }

  /**
   * Resolve after `ms` milliseconds; used for retry backoff.
   */
  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Convert database agent record to AgentInfo, defaulting a missing
   * `exitCode` to `null`.
   *
   * The parameter stays `any` because the repository's record type is not
   * visible from this file; tightening it should be done against that type.
   */
  private toAgentInfo(agent: any): AgentInfo {
    return {
      id: agent.id,
      name: agent.name,
      status: agent.status,
      initiativeId: agent.initiativeId,
      worktreeId: agent.worktreeId,
      exitCode: agent.exitCode ?? null,
    };
  }
}