Files
Codewalkers/apps/server/agent/lifecycle/controller.ts
Lukas May 28521e1c20 chore: merge main into cw/small-change-flow
Integrates main branch changes (headquarters dashboard, task retry count,
agent prompt persistence, remote sync improvements) with the initiative's
errand agent feature. Both features coexist in the merged result.

Key resolutions:
- Schema: take main's errands table (nullable projectId, no conflictFiles,
  with errandsRelations); migrate to 0035_faulty_human_fly
- Router: keep both errandProcedures and headquartersProcedures
- Errand prompt: take main's simpler version (no question-asking flow)
- Manager: take main's status check (running|idle only, no waiting_for_input)
- Tests: update to match removed conflictFiles field and undefined vs null
2026-03-06 16:48:12 +01:00

380 lines
12 KiB
TypeScript

/**
* AgentLifecycleController — Unified orchestrator for complete agent lifecycle.
*
* Replaces scattered lifecycle logic with comprehensive orchestration including:
* - Always clear signal.json before spawn/resume
* - Robust process completion waiting
* - Retry up to 3 times with comprehensive error handling
* - Auth/usage limit error detection with account switching
* - Missing signal recovery with instruction prompts
* - Debug mode archival vs production cleanup
*/
import { createModuleLogger } from '../../logger/index.js';
import type { AgentRepository } from '../../db/repositories/agent-repository.js';
import type { AccountRepository } from '../../db/repositories/account-repository.js';
import type { ProcessManager } from '../process-manager.js';
import type { CleanupManager } from '../cleanup-manager.js';
import type { SpawnAgentOptions } from '../types.js';
import type { SignalManager, SignalData } from './signal-manager.js';
import type { RetryPolicy, AgentError } from './retry-policy.js';
import { AgentExhaustedError, AgentFailureError } from './retry-policy.js';
import type { AgentErrorAnalyzer } from './error-analyzer.js';
import type { CleanupStrategy, AgentInfo } from './cleanup-strategy.js';
import type { EventBus, AgentAccountSwitchedEvent } from '../../events/types.js';
// Module-level logger shared by everything in this file; created via
// createModuleLogger with the module name 'lifecycle-controller'.
const log = createModuleLogger('lifecycle-controller');
/**
 * Outcome reported after waiting for an agent run to finish.
 *
 * Within this file a result is either `{ success: true, signal }` when a
 * signal was observed, or `{ success: false, error, exitCode }` when the
 * wait ended without one.
 */
export interface CompletionResult {
  /** `true` when a signal was read before the wait ended. */
  success: boolean;

  /** The signal payload that was read; only set on success. */
  signal?: SignalData;

  /** Describes why the run is considered failed; only set on failure. */
  error?: Error;

  /** Process exit code when known; `null` when it could not be determined. */
  exitCode?: number | null;

  /** Captured standard error output, if any was collected. */
  stderr?: string;
}
/**
 * Arguments for resuming an existing agent via
 * `AgentLifecycleController.resumeWithRetry`.
 */
export interface ResumeAgentOptions {
  /** Identifier of the agent to resume. */
  agentId: string;

  /** String-keyed answers forwarded unchanged to the resume function. */
  answers: Record<string, string>;
}
/**
 * Orchestrates one agent run end to end: executes the spawn/resume operation,
 * clears and then waits for the agent's signal, classifies failures, persists
 * error info, handles account exhaustion, retries with backoff, and runs
 * post-completion cleanup.
 */
export class AgentLifecycleController {
  /** Maximum time to wait for a signal after the operation has returned. */
  private static readonly SIGNAL_WAIT_TIMEOUT_MS = 30000;

  /** How long an account stays marked as exhausted after a usage-limit error (1 hour). */
  private static readonly ACCOUNT_EXHAUSTION_MS = 60 * 60 * 1000;

  constructor(
    private readonly signalManager: SignalManager,
    private readonly retryPolicy: RetryPolicy,
    private readonly errorAnalyzer: AgentErrorAnalyzer,
    private readonly processManager: ProcessManager,
    private readonly repository: AgentRepository,
    private readonly cleanupManager: CleanupManager,
    private readonly cleanupStrategy: CleanupStrategy,
    private readonly accountRepository?: AccountRepository,
    private readonly debug: boolean = false,
    private readonly eventBus?: EventBus,
  ) {}

  /**
   * Execute a spawn operation under the retry/error-handling loop.
   *
   * @param spawnFn - Performs the actual spawn and resolves with the new agent.
   * @param options - Spawn options, passed to `spawnFn` on every attempt.
   * @returns The agent info of the attempt that completed successfully.
   * @throws AgentExhaustedError when the failure requires an account switch.
   * @throws AgentFailureError when the failure is not retriable or attempts ran out.
   */
  async spawnWithRetry(
    spawnFn: (options: SpawnAgentOptions) => Promise<AgentInfo>,
    options: SpawnAgentOptions
  ): Promise<AgentInfo> {
    log.info({
      taskId: options.taskId,
      provider: options.provider,
      initiativeId: options.initiativeId,
      mode: options.mode
    }, 'starting agent spawn with retry');
    return this.executeWithRetry('spawn', spawnFn, options);
  }

  /**
   * Execute a resume operation under the retry/error-handling loop.
   *
   * After `resumeFn` resolves, the agent is re-read from the repository so the
   * loop can work with an `AgentInfo`.
   *
   * @param resumeFn - Performs the actual resume for the given agent/answers.
   * @param options - Agent id and answers to resume with.
   * @throws Error when the agent cannot be found after resuming (subject to retry).
   * @throws AgentExhaustedError | AgentFailureError as for `spawnWithRetry`.
   */
  async resumeWithRetry(
    resumeFn: (agentId: string, answers: Record<string, string>) => Promise<void>,
    options: ResumeAgentOptions
  ): Promise<void> {
    log.info({
      agentId: options.agentId,
      answerKeys: Object.keys(options.answers)
    }, 'starting agent resume with retry');
    await this.executeWithRetry('resume', async () => {
      await resumeFn(options.agentId, options.answers);
      const agent = await this.repository.findById(options.agentId);
      if (!agent) throw new Error(`Agent '${options.agentId}' not found after resume`);
      return this.toAgentInfo(agent);
    }, options);
  }

  /**
   * Main retry orchestrator shared by spawn and resume.
   *
   * Each attempt runs the operation, clears the signal, waits for completion
   * and, on failure, lets the error analyzer and retry policy decide whether
   * to try again. Both failure paths (a failed completion and a thrown error)
   * wait `retryPolicy.getRetryDelay(attempt)` before the next attempt.
   *
   * @throws AgentExhaustedError when the analyzed error requires an account switch.
   * @throws AgentFailureError when the policy refuses a retry or attempts ran out.
   * @throws the last thrown error when the final attempt itself throws.
   */
  private async executeWithRetry<T>(
    operation: 'spawn' | 'resume',
    operationFn: (options: T) => Promise<AgentInfo>,
    options: T
  ): Promise<AgentInfo> {
    for (let attempt = 1; attempt <= this.retryPolicy.maxAttempts; attempt++) {
      try {
        log.debug({ operation, attempt, maxAttempts: this.retryPolicy.maxAttempts }, 'starting attempt');

        // Execute operation
        const agent = await operationFn(options);
        const agentWorkdir = this.processManager.getAgentWorkdir(agent.worktreeId);

        // Clear any stale signal.json so the wait below only sees a fresh one.
        // NOTE(review): this runs after operationFn has already returned; if the
        // operation starts the process, a signal written very early could be
        // removed here. Confirm the intended ordering against the callers.
        log.debug({ agentId: agent.id, agentWorkdir }, 'clearing signal.json before process start');
        await this.signalManager.clearSignal(agentWorkdir);

        // Wait for process completion with robust detection
        const result = await this.waitForCompletion(agent);
        if (result.success) {
          // Handle post-completion cleanup
          await this.handlePostCompletion(agent);
          log.info({
            agentId: agent.id,
            name: agent.name,
            attempt,
            operation
          }, 'agent lifecycle completed successfully');
          return agent;
        }

        // Analyze error and determine retry strategy
        const agentError = await this.errorAnalyzer.analyzeError(
          result.error ?? new Error('Unknown completion failure'),
          result.exitCode,
          result.stderr,
          agentWorkdir
        );

        // Persist error to DB if required
        if (agentError.shouldPersistToDB) {
          await this.persistError(agent.id, agentError);
        }

        // Usage-limit style errors: mark the account and stop; no retry here.
        if (agentError.requiresAccountSwitch) {
          await this.handleAccountExhaustion(agent.id);
          throw new AgentExhaustedError(agentError.message, agentError);
        }

        // Stop when the policy refuses a retry OR the attempt budget is used up.
        // The explicit budget check guarantees the analyzed error is surfaced as
        // an AgentFailureError instead of sleeping once more and falling out of
        // the loop with a generic error.
        const policyAllowsRetry = this.retryPolicy.shouldRetry(agentError, attempt);
        if (!policyAllowsRetry || attempt >= this.retryPolicy.maxAttempts) {
          log.warn({
            agentId: agent.id,
            errorType: agentError.type,
            attempt,
            maxAttempts: this.retryPolicy.maxAttempts
          }, 'max retry attempts reached or error not retriable');
          throw new AgentFailureError(agentError.message, agentError);
        }

        // Handle special retry cases
        if (agentError.type === 'missing_signal') {
          // This would need to modify the options to add instruction prompt
          // For now, log the special case
          log.info({
            agentId: agent.id,
            attempt
          }, 'will retry with missing signal instruction (not yet implemented)');
        }

        // Wait before retry
        const delay = this.retryPolicy.getRetryDelay(attempt);
        log.info({
          agentId: agent.id,
          attempt,
          delay,
          errorType: agentError.type,
          errorMessage: agentError.message
        }, 'retrying after delay');
        await this.delay(delay);
      } catch (error) {
        // Terminal errors raised above must propagate untouched.
        if (error instanceof AgentExhaustedError || error instanceof AgentFailureError) {
          throw error; // Don't retry these
        }

        const errorMessage = error instanceof Error ? error.message : String(error);
        if (attempt >= this.retryPolicy.maxAttempts) {
          log.error({
            operation,
            attempt,
            error: errorMessage
          }, 'final attempt failed, giving up');
          throw error;
        }

        log.warn({
          operation,
          attempt,
          error: errorMessage
        }, 'attempt failed, will retry');

        // Apply the same backoff as the completion-failure path so a throwing
        // operation is not retried in a tight loop.
        await this.delay(this.retryPolicy.getRetryDelay(attempt));
      }
    }

    // Only reachable when the loop body never ran (e.g. maxAttempts < 1).
    throw new Error('Unexpected: retry loop completed without success or terminal error');
  }

  /**
   * Wait for the agent's signal and translate the outcome into a
   * `CompletionResult`.
   *
   * A signal returned by the signal manager counts as success; no signal
   * within the timeout is reported as a failure with `exitCode: null`.
   */
  private async waitForCompletion(agent: AgentInfo): Promise<CompletionResult> {
    const agentWorkdir = this.processManager.getAgentWorkdir(agent.worktreeId);
    log.debug({
      agentId: agent.id,
      name: agent.name,
      agentWorkdir
    }, 'waiting for process completion');

    // Wait for process to exit (this would need integration with ProcessManager)
    // For now, simulate with a timeout approach
    // TODO: Implement waitForProcessCompletion in ProcessManager
    const signal = await this.signalManager.waitForSignal(
      agentWorkdir,
      AgentLifecycleController.SIGNAL_WAIT_TIMEOUT_MS
    );
    if (signal) {
      log.debug({
        agentId: agent.id,
        signalStatus: signal.status
      }, 'agent completed with valid signal');
      return { success: true, signal };
    }

    // No signal found - this is an error condition
    log.warn({
      agentId: agent.id,
      agentWorkdir
    }, 'process completed without valid signal.json');
    return {
      success: false,
      error: new Error('Process completed without valid signal.json'),
      exitCode: null // Would get from ProcessManager
    };
  }

  /**
   * Run post-completion cleanup unless the agent is waiting for user input.
   *
   * Cleanup is best-effort: failures are logged and never propagate, so a
   * successful run is not turned into a failure by cleanup problems.
   */
  private async handlePostCompletion(agent: AgentInfo): Promise<void> {
    // Only cleanup if agent is not waiting for user input
    if (agent.status === 'waiting_for_input') {
      log.debug({ agentId: agent.id }, 'agent waiting for input, skipping cleanup');
      return;
    }

    try {
      const cleanupAction = await this.cleanupStrategy.shouldCleanup(agent, this.debug);
      await this.cleanupStrategy.executeCleanup(agent, cleanupAction);
      log.debug({
        agentId: agent.id,
        name: agent.name,
        cleanupAction
      }, 'post-completion cleanup executed');
    } catch (error) {
      log.warn({
        agentId: agent.id,
        error: error instanceof Error ? error.message : String(error)
      }, 'post-completion cleanup failed');
    }
  }

  /**
   * Persist basic error info for the agent (best-effort).
   *
   * Only the exit code and update timestamp are written; database failures
   * are logged and swallowed so they cannot mask the original agent error.
   */
  private async persistError(agentId: string, error: AgentError): Promise<void> {
    try {
      // This would need database schema updates to store error details
      // (type, message, transient flag). For now, just update with basic info.
      await this.repository.update(agentId, {
        exitCode: error.exitCode,
        updatedAt: new Date(),
      });
      log.debug({
        agentId,
        errorType: error.type,
        exitCode: error.exitCode
      }, 'error details persisted to database');
    } catch (dbError) {
      log.warn({
        agentId,
        error: dbError instanceof Error ? dbError.message : String(dbError)
      }, 'failed to persist error to database');
    }
  }

  /**
   * Mark the agent's account as exhausted and, when a replacement account and
   * an event bus are available, emit an `agent:account_switched` event.
   *
   * Does nothing when no account repository is configured or the agent has no
   * account. Failures are logged and swallowed.
   */
  private async handleAccountExhaustion(agentId: string): Promise<void> {
    if (!this.accountRepository) {
      log.debug({ agentId }, 'no account repository available for exhaustion handling');
      return;
    }

    try {
      const agent = await this.repository.findById(agentId);
      if (!agent?.accountId) {
        log.debug({ agentId }, 'agent has no account ID for exhaustion handling');
        return;
      }

      const previousAccountId = agent.accountId;

      // Mark account as exhausted for 1 hour
      const exhaustedUntil = new Date(Date.now() + AgentLifecycleController.ACCOUNT_EXHAUSTION_MS);
      await this.accountRepository.markExhausted(previousAccountId, exhaustedUntil);
      log.info({
        agentId,
        accountId: previousAccountId,
        exhaustedUntil
      }, 'marked account as exhausted due to usage limits');

      // Find the next available account and emit account_switched event
      const newAccount = await this.accountRepository.findNextAvailable(agent.provider ?? 'claude');
      if (newAccount && this.eventBus) {
        const event: AgentAccountSwitchedEvent = {
          type: 'agent:account_switched',
          timestamp: new Date(),
          payload: {
            agentId,
            name: agent.name,
            previousAccountId,
            newAccountId: newAccount.id,
            reason: 'account_exhausted',
          },
        };
        this.eventBus.emit(event);
      }
    } catch (error) {
      log.warn({
        agentId,
        error: error instanceof Error ? error.message : String(error)
      }, 'failed to mark account as exhausted');
    }
  }

  /**
   * Resolve after `ms` milliseconds; used for retry backoff.
   */
  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Convert a database agent record to `AgentInfo`, defaulting a missing
   * exit code to `null`.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- repository record type is not visible in this module
  private toAgentInfo(agent: any): AgentInfo {
    return {
      id: agent.id,
      name: agent.name,
      status: agent.status,
      initiativeId: agent.initiativeId,
      worktreeId: agent.worktreeId,
      exitCode: agent.exitCode ?? null,
    };
  }
}