/**
 * AgentLifecycleController — Unified orchestrator for complete agent lifecycle.
 *
 * Replaces scattered lifecycle logic with a single orchestration path:
 * - Clears signal.json once the spawn/resume operation has returned its agent
 * - Waits for the agent's completion signal
 * - Retries up to `retryPolicy.maxAttempts` times, with backoff between attempts
 * - Usage-limit detection (via the error analyzer) with account exhaustion handling
 * - Missing-signal detection (the instruction-prompt recovery is not implemented yet)
 * - Post-completion cleanup delegated to the cleanup strategy (debug flag passed through)
 */

import { createModuleLogger } from '../../logger/index.js';
import type { AgentRepository } from '../../db/repositories/agent-repository.js';
import type { AccountRepository } from '../../db/repositories/account-repository.js';
import type { ProcessManager } from '../process-manager.js';
import type { CleanupManager } from '../cleanup-manager.js';
import type { SpawnAgentOptions } from '../types.js';
import type { SignalManager, SignalData } from './signal-manager.js';
import type { RetryPolicy, AgentError } from './retry-policy.js';
import { AgentExhaustedError, AgentFailureError } from './retry-policy.js';
import type { AgentErrorAnalyzer } from './error-analyzer.js';
import type { CleanupStrategy, AgentInfo } from './cleanup-strategy.js';
import type { EventBus, AgentAccountSwitchedEvent } from '../../events/types.js';

const log = createModuleLogger('lifecycle-controller');

/** How long `waitForCompletion` waits for signal.json before reporting a failure. */
const SIGNAL_WAIT_TIMEOUT_MS = 30000;

/** How long an account stays marked as exhausted after a usage-limit error (1 hour). */
const ACCOUNT_EXHAUSTION_COOLDOWN_MS = 60 * 60 * 1000;

/** Outcome of waiting for an agent to finish. */
export interface CompletionResult {
  /** True when a signal was received within the timeout. */
  success: boolean;
  /** The signal that was received (set on success). */
  signal?: SignalData;
  /** The failure reason (set on failure). */
  error?: Error;
  exitCode?: number | null;
  stderr?: string;
}

/** Arguments for resuming an existing agent. */
export interface ResumeAgentOptions {
  agentId: string;
  // NOTE(review): value type reconstructed as string from usage — confirm against callers.
  answers: Record<string, string>;
}

export class AgentLifecycleController {
  constructor(
    private signalManager: SignalManager,
    private retryPolicy: RetryPolicy,
    private errorAnalyzer: AgentErrorAnalyzer,
    private processManager: ProcessManager,
    private repository: AgentRepository,
    private cleanupManager: CleanupManager,
    private cleanupStrategy: CleanupStrategy,
    private accountRepository?: AccountRepository,
    private debug: boolean = false,
    private eventBus?: EventBus,
  ) {}

  /**
   * Execute a spawn operation with retry and error handling.
   *
   * @param spawnFn - Performs the actual spawn and resolves with the created agent.
   * @param options - Passed unchanged to `spawnFn` on every attempt.
   * @returns The agent from the attempt that completed successfully.
   * @throws AgentExhaustedError when the analyzed error requires an account switch.
   * @throws AgentFailureError when the error is not retriable or attempts are used up.
   */
  async spawnWithRetry(
    spawnFn: (options: SpawnAgentOptions) => Promise<AgentInfo>,
    options: SpawnAgentOptions
  ): Promise<AgentInfo> {
    log.info({
      taskId: options.taskId,
      provider: options.provider,
      initiativeId: options.initiativeId,
      mode: options.mode
    }, 'starting agent spawn with retry');

    return this.executeWithRetry('spawn', spawnFn, options);
  }

  /**
   * Execute a resume operation with retry and error handling.
   *
   * After `resumeFn` resolves, the agent is re-read from the repository so the
   * retry loop works with the current record.
   *
   * @param resumeFn - Performs the actual resume; its resolved value is ignored.
   * @param options - Agent id and the answers forwarded to `resumeFn`.
   * @throws Error when the agent cannot be found after the final resume attempt.
   * @throws AgentExhaustedError | AgentFailureError as for `spawnWithRetry`.
   */
  async resumeWithRetry(
    resumeFn: (agentId: string, answers: Record<string, string>) => Promise<unknown>,
    options: ResumeAgentOptions
  ): Promise<void> {
    log.info({
      agentId: options.agentId,
      answerKeys: Object.keys(options.answers)
    }, 'starting agent resume with retry');

    await this.executeWithRetry('resume', async () => {
      await resumeFn(options.agentId, options.answers);
      const agent = await this.repository.findById(options.agentId);
      if (!agent) throw new Error(`Agent '${options.agentId}' not found after resume`);
      return this.toAgentInfo(agent);
    }, options);
  }

  /**
   * Main retry orchestrator for spawn/resume operations.
   *
   * Each attempt runs `operationFn`, clears signal.json, waits for completion and,
   * on failure, asks the error analyzer and retry policy what to do next.
   * The final attempt is always terminal, and every retry (whether caused by a
   * failed completion or by a thrown error) waits `retryPolicy.getRetryDelay` first.
   *
   * @param operation - Label used in log output only.
   * @param operationFn - Starts or resumes the agent and resolves with its info.
   * @param options - Passed unchanged to `operationFn` on every attempt.
   * @returns The agent from the successful attempt.
   */
  private async executeWithRetry<T>(
    operation: 'spawn' | 'resume',
    operationFn: (options: T) => Promise<AgentInfo>,
    options: T
  ): Promise<AgentInfo> {
    const maxAttempts = this.retryPolicy.maxAttempts;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      const isFinalAttempt = attempt >= maxAttempts;

      try {
        log.debug({ operation, attempt, maxAttempts }, 'starting attempt');

        // Execute operation
        const agent = await operationFn(options);
        const agentWorkdir = this.processManager.getAgentWorkdir(agent.worktreeId);

        // NOTE(review): the signal is cleared AFTER operationFn has resolved. If
        // operationFn already started the process, a fast agent could write
        // signal.json before this line and have it deleted — confirm ordering
        // against the spawn/resume implementations.
        log.debug({ agentId: agent.id, agentWorkdir }, 'clearing signal.json before process start');
        await this.signalManager.clearSignal(agentWorkdir);

        // Wait for process completion
        const result = await this.waitForCompletion(agent);

        if (result.success) {
          await this.handlePostCompletion(agent);

          log.info({
            agentId: agent.id,
            name: agent.name,
            attempt,
            operation
          }, 'agent lifecycle completed successfully');

          return agent;
        }

        // Analyze error and determine retry strategy
        const agentError = await this.errorAnalyzer.analyzeError(
          result.error ?? new Error('Unknown completion failure'),
          result.exitCode,
          result.stderr,
          agentWorkdir
        );

        // Persist error to DB if required
        if (agentError.shouldPersistToDB) {
          await this.persistError(agent.id, agentError);
        }

        // Usage limits are terminal for this run: mark the account and stop.
        if (agentError.requiresAccountSwitch) {
          await this.handleAccountExhaustion(agent.id);
          throw new AgentExhaustedError(agentError.message, agentError);
        }

        // The policy is always consulted, but the last attempt is terminal even
        // if the policy would allow another retry; otherwise the analyzed error
        // would be lost behind the generic fall-through error below.
        const policyAllowsRetry = this.retryPolicy.shouldRetry(agentError, attempt);
        if (!policyAllowsRetry || isFinalAttempt) {
          log.warn({
            agentId: agent.id,
            errorType: agentError.type,
            attempt,
            maxAttempts
          }, 'max retry attempts reached or error not retriable');
          throw new AgentFailureError(agentError.message, agentError);
        }

        // Handle special retry cases
        if (agentError.type === 'missing_signal') {
          // This would need to modify the options to add an instruction prompt.
          // For now, only log the special case.
          log.info({
            agentId: agent.id,
            attempt
          }, 'will retry with missing signal instruction (not yet implemented)');
        }

        // Wait before retry
        const delay = this.retryPolicy.getRetryDelay(attempt);
        log.info({
          agentId: agent.id,
          attempt,
          delay,
          errorType: agentError.type,
          errorMessage: agentError.message
        }, 'retrying after delay');
        await this.delay(delay);
      } catch (error) {
        // Terminal errors raised above must not be retried.
        if (error instanceof AgentExhaustedError || error instanceof AgentFailureError) {
          throw error;
        }

        const message = error instanceof Error ? error.message : String(error);

        if (isFinalAttempt) {
          log.error({ operation, attempt, error: message }, 'final attempt failed, giving up');
          throw error;
        }

        log.warn({ operation, attempt, error: message }, 'attempt failed, will retry');

        // Apply the same backoff as the completion-failure path instead of
        // retrying immediately.
        await this.delay(this.retryPolicy.getRetryDelay(attempt));
      }
    }

    // Reached only when the loop body never ran (maxAttempts below 1).
    throw new Error('Unexpected: retry loop completed without success or terminal error');
  }

  /**
   * Wait for the agent's completion signal.
   *
   * Only the signal file is observed; process exit is not tracked here yet.
   * TODO: Implement waitForProcessCompletion in ProcessManager.
   *
   * @returns `success: true` with the signal when one arrives within
   *   `SIGNAL_WAIT_TIMEOUT_MS`; otherwise `success: false` with an error and a
   *   null exit code.
   */
  private async waitForCompletion(agent: AgentInfo): Promise<CompletionResult> {
    const agentWorkdir = this.processManager.getAgentWorkdir(agent.worktreeId);

    log.debug({
      agentId: agent.id,
      name: agent.name,
      agentWorkdir
    }, 'waiting for process completion');

    // Presumably resolves with a falsy value when no signal shows up in time —
    // confirm against SignalManager.waitForSignal.
    const signal = await this.signalManager.waitForSignal(agentWorkdir, SIGNAL_WAIT_TIMEOUT_MS);

    if (signal) {
      log.debug({
        agentId: agent.id,
        signalStatus: signal.status
      }, 'agent completed with valid signal');

      return { success: true, signal };
    }

    // No signal found - this is an error condition
    log.warn({
      agentId: agent.id,
      agentWorkdir
    }, 'process completed without valid signal.json');

    return {
      success: false,
      error: new Error('Process completed without valid signal.json'),
      exitCode: null // Would get from ProcessManager
    };
  }

  /**
   * Run post-completion cleanup through the cleanup strategy.
   *
   * Skipped when the agent is waiting for user input. Cleanup failures are
   * logged and swallowed so they never fail an otherwise successful run.
   *
   * NOTE(review): `agent.status` is the value returned by the spawn/resume
   * operation and is not re-read after completion — confirm it cannot be stale.
   */
  private async handlePostCompletion(agent: AgentInfo): Promise<void> {
    if (agent.status === 'waiting_for_input') {
      log.debug({ agentId: agent.id }, 'agent waiting for input, skipping cleanup');
      return;
    }

    try {
      const cleanupAction = await this.cleanupStrategy.shouldCleanup(agent, this.debug);
      await this.cleanupStrategy.executeCleanup(agent, cleanupAction);

      log.debug({
        agentId: agent.id,
        name: agent.name,
        cleanupAction
      }, 'post-completion cleanup executed');
    } catch (error) {
      log.warn({
        agentId: agent.id,
        error: error instanceof Error ? error.message : String(error)
      }, 'post-completion cleanup failed');
    }
  }

  /**
   * Persist error information to the database (best effort).
   *
   * Only `exitCode` and `updatedAt` are written; storing the error type,
   * message and flags would need database schema updates. Database failures
   * are logged and swallowed.
   */
  private async persistError(agentId: string, error: AgentError): Promise<void> {
    try {
      await this.repository.update(agentId, {
        exitCode: error.exitCode,
        updatedAt: new Date(),
      });

      log.debug({
        agentId,
        errorType: error.type,
        exitCode: error.exitCode
      }, 'error details persisted to database');
    } catch (dbError) {
      log.warn({
        agentId,
        error: dbError instanceof Error ? dbError.message : String(dbError)
      }, 'failed to persist error to database');
    }
  }

  /**
   * Handle account exhaustion: mark the agent's account as exhausted for
   * `ACCOUNT_EXHAUSTION_COOLDOWN_MS` and, when another account is available
   * and an event bus is configured, emit an `agent:account_switched` event.
   *
   * Does nothing when no account repository is configured or the agent has no
   * account. Failures are logged and swallowed.
   */
  private async handleAccountExhaustion(agentId: string): Promise<void> {
    if (!this.accountRepository) {
      log.debug({ agentId }, 'no account repository available for exhaustion handling');
      return;
    }

    try {
      const agent = await this.repository.findById(agentId);
      if (!agent?.accountId) {
        log.debug({ agentId }, 'agent has no account ID for exhaustion handling');
        return;
      }

      const previousAccountId = agent.accountId;

      const exhaustedUntil = new Date(Date.now() + ACCOUNT_EXHAUSTION_COOLDOWN_MS);
      await this.accountRepository.markExhausted(previousAccountId, exhaustedUntil);

      log.info({
        agentId,
        accountId: previousAccountId,
        exhaustedUntil
      }, 'marked account as exhausted due to usage limits');

      // Find the next available account and emit account_switched event
      const newAccount = await this.accountRepository.findNextAvailable(agent.provider ?? 'claude');
      if (newAccount && this.eventBus) {
        const event: AgentAccountSwitchedEvent = {
          type: 'agent:account_switched',
          timestamp: new Date(),
          payload: {
            agentId,
            name: agent.name,
            previousAccountId,
            newAccountId: newAccount.id,
            reason: 'account_exhausted',
          },
        };
        this.eventBus.emit(event);
      }
    } catch (error) {
      log.warn({
        agentId,
        error: error instanceof Error ? error.message : String(error)
      }, 'failed to mark account as exhausted');
    }
  }

  /**
   * Resolve after `ms` milliseconds; used for retry backoff.
   */
  private delay(ms: number): Promise<void> {
    return new Promise<void>(resolve => setTimeout(resolve, ms));
  }

  /**
   * Convert a database agent record to AgentInfo, defaulting a missing
   * exit code to null.
   */
  // NOTE(review): `any` kept because the repository's record type is not
  // visible in this file — tighten once it can be imported.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private toAgentInfo(agent: any): AgentInfo {
    return {
      id: agent.id,
      name: agent.name,
      status: agent.status,
      initiativeId: agent.initiativeId,
      worktreeId: agent.worktreeId,
      exitCode: agent.exitCode ?? null,
    };
  }
}