Integrates main branch changes (headquarters dashboard, task retry count, agent prompt persistence, remote sync improvements) with the initiative's errand agent feature. Both features coexist in the merged result. Key resolutions: - Schema: take main's errands table (nullable projectId, no conflictFiles, with errandsRelations); migrate to 0035_faulty_human_fly - Router: keep both errandProcedures and headquartersProcedures - Errand prompt: take main's simpler version (no question-asking flow) - Manager: take main's status check (running|idle only, no waiting_for_input) - Tests: update to match removed conflictFiles field and undefined vs null
380 lines · 12 KiB · TypeScript
/**
|
|
* AgentLifecycleController — Unified orchestrator for complete agent lifecycle.
|
|
*
|
|
* Replaces scattered lifecycle logic with comprehensive orchestration including:
|
|
* - Always clear signal.json before spawn/resume
|
|
* - Robust process completion waiting
|
|
* - Retry up to 3 times with comprehensive error handling
|
|
* - Auth/usage limit error detection with account switching
|
|
* - Missing signal recovery with instruction prompts
|
|
* - Debug mode archival vs production cleanup
|
|
*/
|
|
|
|
import { createModuleLogger } from '../../logger/index.js';
|
|
import type { AgentRepository } from '../../db/repositories/agent-repository.js';
|
|
import type { AccountRepository } from '../../db/repositories/account-repository.js';
|
|
import type { ProcessManager } from '../process-manager.js';
|
|
import type { CleanupManager } from '../cleanup-manager.js';
|
|
import type { SpawnAgentOptions } from '../types.js';
|
|
import type { SignalManager, SignalData } from './signal-manager.js';
|
|
import type { RetryPolicy, AgentError } from './retry-policy.js';
|
|
import { AgentExhaustedError, AgentFailureError } from './retry-policy.js';
|
|
import type { AgentErrorAnalyzer } from './error-analyzer.js';
|
|
import type { CleanupStrategy, AgentInfo } from './cleanup-strategy.js';
|
|
import type { EventBus, AgentAccountSwitchedEvent } from '../../events/types.js';
|
|
|
|
// Module-scoped structured logger; every log line below is tagged with this module name.
const log = createModuleLogger('lifecycle-controller');
|
|
|
|
/**
 * Outcome of waiting for an agent process to finish (see
 * `AgentLifecycleController.waitForCompletion`).
 */
export interface CompletionResult {
  /** True when the agent produced a valid signal.json before the timeout. */
  success: boolean;
  /** Parsed signal payload; present only when `success` is true. */
  signal?: SignalData;
  /** Failure cause; present only when `success` is false. */
  error?: Error;
  /** Process exit code if known; null when it could not be determined. */
  exitCode?: number | null;
  /** Captured stderr output, when available. */
  stderr?: string;
}
|
|
|
|
/**
 * Options for resuming an agent that paused awaiting user input.
 */
export interface ResumeAgentOptions {
  /** ID of the agent to resume. */
  agentId: string;
  /** Map of question key to user-provided answer, forwarded to the agent. */
  answers: Record<string, string>;
}
|
|
|
|
export class AgentLifecycleController {
|
|
constructor(
|
|
private signalManager: SignalManager,
|
|
private retryPolicy: RetryPolicy,
|
|
private errorAnalyzer: AgentErrorAnalyzer,
|
|
private processManager: ProcessManager,
|
|
private repository: AgentRepository,
|
|
private cleanupManager: CleanupManager,
|
|
private cleanupStrategy: CleanupStrategy,
|
|
private accountRepository?: AccountRepository,
|
|
private debug: boolean = false,
|
|
private eventBus?: EventBus,
|
|
) {}
|
|
|
|
/**
|
|
* Execute spawn operation with comprehensive retry and error handling.
|
|
* Always clears signal.json before starting and waits for process completion.
|
|
*/
|
|
async spawnWithRetry(
|
|
spawnFn: (options: SpawnAgentOptions) => Promise<AgentInfo>,
|
|
options: SpawnAgentOptions
|
|
): Promise<AgentInfo> {
|
|
log.info({
|
|
taskId: options.taskId,
|
|
provider: options.provider,
|
|
initiativeId: options.initiativeId,
|
|
mode: options.mode
|
|
}, 'starting agent spawn with retry');
|
|
|
|
return this.executeWithRetry('spawn', spawnFn, options);
|
|
}
|
|
|
|
/**
|
|
* Execute resume operation with comprehensive retry and error handling.
|
|
* Always clears signal.json before resuming and waits for process completion.
|
|
*/
|
|
async resumeWithRetry(
|
|
resumeFn: (agentId: string, answers: Record<string, string>) => Promise<void>,
|
|
options: ResumeAgentOptions
|
|
): Promise<void> {
|
|
log.info({
|
|
agentId: options.agentId,
|
|
answerKeys: Object.keys(options.answers)
|
|
}, 'starting agent resume with retry');
|
|
|
|
await this.executeWithRetry('resume', async () => {
|
|
await resumeFn(options.agentId, options.answers);
|
|
const agent = await this.repository.findById(options.agentId);
|
|
if (!agent) throw new Error(`Agent '${options.agentId}' not found after resume`);
|
|
return this.toAgentInfo(agent);
|
|
}, options);
|
|
}
|
|
|
|
/**
|
|
* Main retry orchestrator for spawn/resume operations.
|
|
*/
|
|
private async executeWithRetry<T>(
|
|
operation: 'spawn' | 'resume',
|
|
operationFn: (options: T) => Promise<AgentInfo>,
|
|
options: T
|
|
): Promise<AgentInfo> {
|
|
|
|
for (let attempt = 1; attempt <= this.retryPolicy.maxAttempts; attempt++) {
|
|
try {
|
|
log.debug({ operation, attempt, maxAttempts: this.retryPolicy.maxAttempts }, 'starting attempt');
|
|
|
|
// Execute operation
|
|
const agent = await operationFn(options);
|
|
const agentWorkdir = this.processManager.getAgentWorkdir(agent.worktreeId);
|
|
|
|
// CRITICAL: Always clear signal.json before start
|
|
log.debug({ agentId: agent.id, agentWorkdir }, 'clearing signal.json before process start');
|
|
await this.signalManager.clearSignal(agentWorkdir);
|
|
|
|
// Wait for process completion with robust detection
|
|
const result = await this.waitForCompletion(agent);
|
|
|
|
if (result.success) {
|
|
// Handle post-completion cleanup
|
|
await this.handlePostCompletion(agent);
|
|
log.info({
|
|
agentId: agent.id,
|
|
name: agent.name,
|
|
attempt,
|
|
operation
|
|
}, 'agent lifecycle completed successfully');
|
|
return agent;
|
|
}
|
|
|
|
// Analyze error and determine retry strategy
|
|
const agentError = await this.errorAnalyzer.analyzeError(
|
|
result.error || new Error('Unknown completion failure'),
|
|
result.exitCode,
|
|
result.stderr,
|
|
agentWorkdir
|
|
);
|
|
|
|
// Persist error to DB if required
|
|
if (agentError.shouldPersistToDB) {
|
|
await this.persistError(agent.id, agentError);
|
|
}
|
|
|
|
// Handle account switching for usage limits
|
|
if (agentError.requiresAccountSwitch) {
|
|
await this.handleAccountExhaustion(agent.id);
|
|
throw new AgentExhaustedError(agentError.message, agentError);
|
|
}
|
|
|
|
// Check if should retry
|
|
if (!this.retryPolicy.shouldRetry(agentError, attempt)) {
|
|
log.warn({
|
|
agentId: agent.id,
|
|
errorType: agentError.type,
|
|
attempt,
|
|
maxAttempts: this.retryPolicy.maxAttempts
|
|
}, 'max retry attempts reached or error not retriable');
|
|
throw new AgentFailureError(agentError.message, agentError);
|
|
}
|
|
|
|
// Handle special retry cases
|
|
if (agentError.type === 'missing_signal') {
|
|
// This would need to modify the options to add instruction prompt
|
|
// For now, log the special case
|
|
log.info({
|
|
agentId: agent.id,
|
|
attempt
|
|
}, 'will retry with missing signal instruction (not yet implemented)');
|
|
}
|
|
|
|
// Wait before retry
|
|
const delay = this.retryPolicy.getRetryDelay(attempt);
|
|
log.info({
|
|
agentId: agent.id,
|
|
attempt,
|
|
delay,
|
|
errorType: agentError.type,
|
|
errorMessage: agentError.message
|
|
}, 'retrying after delay');
|
|
await this.delay(delay);
|
|
|
|
} catch (error) {
|
|
if (error instanceof AgentExhaustedError || error instanceof AgentFailureError) {
|
|
throw error; // Don't retry these
|
|
}
|
|
|
|
if (attempt === this.retryPolicy.maxAttempts) {
|
|
log.error({
|
|
operation,
|
|
attempt,
|
|
error: error instanceof Error ? error.message : String(error)
|
|
}, 'final attempt failed, giving up');
|
|
throw error;
|
|
}
|
|
|
|
log.warn({
|
|
operation,
|
|
attempt,
|
|
error: error instanceof Error ? error.message : String(error)
|
|
}, 'attempt failed, will retry');
|
|
}
|
|
}
|
|
|
|
throw new Error('Unexpected: retry loop completed without success or terminal error');
|
|
}
|
|
|
|
/**
|
|
* Wait for process completion with robust signal detection.
|
|
* Replaces scattered completion detection with unified approach.
|
|
*/
|
|
private async waitForCompletion(agent: AgentInfo): Promise<CompletionResult> {
|
|
const agentWorkdir = this.processManager.getAgentWorkdir(agent.worktreeId);
|
|
|
|
log.debug({
|
|
agentId: agent.id,
|
|
name: agent.name,
|
|
agentWorkdir
|
|
}, 'waiting for process completion');
|
|
|
|
// Wait for process to exit (this would need integration with ProcessManager)
|
|
// For now, simulate with a timeout approach
|
|
// TODO: Implement waitForProcessCompletion in ProcessManager
|
|
|
|
// Wait for signal within reasonable timeout (30 seconds)
|
|
const signal = await this.signalManager.waitForSignal(agentWorkdir, 30000);
|
|
|
|
if (signal) {
|
|
log.debug({
|
|
agentId: agent.id,
|
|
signalStatus: signal.status
|
|
}, 'agent completed with valid signal');
|
|
return { success: true, signal };
|
|
}
|
|
|
|
// No signal found - this is an error condition
|
|
log.warn({
|
|
agentId: agent.id,
|
|
agentWorkdir
|
|
}, 'process completed without valid signal.json');
|
|
|
|
return {
|
|
success: false,
|
|
error: new Error('Process completed without valid signal.json'),
|
|
exitCode: null // Would get from ProcessManager
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Handle post-completion cleanup based on agent status and debug mode.
|
|
*/
|
|
private async handlePostCompletion(agent: AgentInfo): Promise<void> {
|
|
// Only cleanup if agent is not waiting for user input
|
|
if (agent.status === 'waiting_for_input') {
|
|
log.debug({ agentId: agent.id }, 'agent waiting for input, skipping cleanup');
|
|
return;
|
|
}
|
|
|
|
try {
|
|
const cleanupAction = await this.cleanupStrategy.shouldCleanup(agent, this.debug);
|
|
await this.cleanupStrategy.executeCleanup(agent, cleanupAction);
|
|
|
|
log.debug({
|
|
agentId: agent.id,
|
|
name: agent.name,
|
|
cleanupAction
|
|
}, 'post-completion cleanup executed');
|
|
} catch (error) {
|
|
log.warn({
|
|
agentId: agent.id,
|
|
error: error instanceof Error ? error.message : String(error)
|
|
}, 'post-completion cleanup failed');
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Persist error details to database for debugging.
|
|
*/
|
|
private async persistError(agentId: string, error: AgentError): Promise<void> {
|
|
try {
|
|
const errorData = {
|
|
errorType: error.type,
|
|
errorMessage: error.message,
|
|
exitCode: error.exitCode,
|
|
isTransient: error.isTransient,
|
|
requiresAccountSwitch: error.requiresAccountSwitch,
|
|
updatedAt: new Date(),
|
|
};
|
|
|
|
// This would need database schema updates to store error details
|
|
// For now, just update with basic error info
|
|
await this.repository.update(agentId, {
|
|
exitCode: error.exitCode,
|
|
updatedAt: new Date(),
|
|
});
|
|
|
|
log.debug({
|
|
agentId,
|
|
errorType: error.type,
|
|
exitCode: error.exitCode
|
|
}, 'error details persisted to database');
|
|
} catch (dbError) {
|
|
log.warn({
|
|
agentId,
|
|
error: dbError instanceof Error ? dbError.message : String(dbError)
|
|
}, 'failed to persist error to database');
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Handle account exhaustion by marking account as exhausted and emitting account_switched event.
|
|
*/
|
|
private async handleAccountExhaustion(agentId: string): Promise<void> {
|
|
if (!this.accountRepository) {
|
|
log.debug({ agentId }, 'no account repository available for exhaustion handling');
|
|
return;
|
|
}
|
|
|
|
try {
|
|
const agent = await this.repository.findById(agentId);
|
|
if (!agent?.accountId) {
|
|
log.debug({ agentId }, 'agent has no account ID for exhaustion handling');
|
|
return;
|
|
}
|
|
|
|
const previousAccountId = agent.accountId;
|
|
|
|
// Mark account as exhausted for 1 hour
|
|
const exhaustedUntil = new Date(Date.now() + 60 * 60 * 1000);
|
|
await this.accountRepository.markExhausted(previousAccountId, exhaustedUntil);
|
|
|
|
log.info({
|
|
agentId,
|
|
accountId: previousAccountId,
|
|
exhaustedUntil
|
|
}, 'marked account as exhausted due to usage limits');
|
|
|
|
// Find the next available account and emit account_switched event
|
|
const newAccount = await this.accountRepository.findNextAvailable(agent.provider ?? 'claude');
|
|
if (newAccount && this.eventBus) {
|
|
const event: AgentAccountSwitchedEvent = {
|
|
type: 'agent:account_switched',
|
|
timestamp: new Date(),
|
|
payload: {
|
|
agentId,
|
|
name: agent.name,
|
|
previousAccountId,
|
|
newAccountId: newAccount.id,
|
|
reason: 'account_exhausted',
|
|
},
|
|
};
|
|
this.eventBus.emit(event);
|
|
}
|
|
} catch (error) {
|
|
log.warn({
|
|
agentId,
|
|
error: error instanceof Error ? error.message : String(error)
|
|
}, 'failed to mark account as exhausted');
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Simple delay utility for retry backoff.
|
|
*/
|
|
private delay(ms: number): Promise<void> {
|
|
return new Promise(resolve => setTimeout(resolve, ms));
|
|
}
|
|
|
|
/**
|
|
* Convert database agent record to AgentInfo.
|
|
*/
|
|
private toAgentInfo(agent: any): AgentInfo {
|
|
return {
|
|
id: agent.id,
|
|
name: agent.name,
|
|
status: agent.status,
|
|
initiativeId: agent.initiativeId,
|
|
worktreeId: agent.worktreeId,
|
|
exitCode: agent.exitCode ?? null,
|
|
};
|
|
}
|
|
} |