refactor: Restructure monorepo to apps/server/ and apps/web/ layout
Move src/ → apps/server/ and packages/web/ → apps/web/ to adopt standard monorepo conventions (apps/ for runnable apps, packages/ for reusable libraries). Update all config files, shared package imports, test fixtures, and documentation to reflect the new paths.

Key fixes:
- Update workspace config to ["apps/*", "packages/*"]
- Update tsconfig.json rootDir/include for apps/server/
- Add apps/web/** to vitest exclude list
- Update drizzle.config.ts schema path
- Fix ensure-schema.ts migration path detection (3 levels up in dev, 2 levels up in dist)
- Fix tests/integration/cli-server.test.ts import paths
- Update packages/shared imports to apps/server/ paths
- Update all docs/ files with new paths
This commit is contained in:
358
apps/server/agent/lifecycle/controller.ts
Normal file
358
apps/server/agent/lifecycle/controller.ts
Normal file
@@ -0,0 +1,358 @@
|
||||
/**
|
||||
* AgentLifecycleController — Unified orchestrator for complete agent lifecycle.
|
||||
*
|
||||
* Replaces scattered lifecycle logic with comprehensive orchestration including:
|
||||
* - Always clear signal.json before spawn/resume
|
||||
* - Robust process completion waiting
|
||||
* - Retry up to 3 times with comprehensive error handling
|
||||
* - Auth/usage limit error detection with account switching
|
||||
* - Missing signal recovery with instruction prompts
|
||||
* - Debug mode archival vs production cleanup
|
||||
*/
|
||||
|
||||
import { createModuleLogger } from '../../logger/index.js';
|
||||
import type { AgentRepository } from '../../db/repositories/agent-repository.js';
|
||||
import type { AccountRepository } from '../../db/repositories/account-repository.js';
|
||||
import type { ProcessManager } from '../process-manager.js';
|
||||
import type { CleanupManager } from '../cleanup-manager.js';
|
||||
import type { SpawnAgentOptions } from '../types.js';
|
||||
import type { SignalManager, SignalData } from './signal-manager.js';
|
||||
import type { RetryPolicy, AgentError } from './retry-policy.js';
|
||||
import { AgentExhaustedError, AgentFailureError } from './retry-policy.js';
|
||||
import type { AgentErrorAnalyzer } from './error-analyzer.js';
|
||||
import type { CleanupStrategy, AgentInfo } from './cleanup-strategy.js';
|
||||
|
||||
// Module-scoped structured logger tagged with this module's name.
const log = createModuleLogger('lifecycle-controller');
|
||||
|
||||
/**
 * Outcome of waiting for an agent process to finish (see
 * AgentLifecycleController.waitForCompletion).
 */
export interface CompletionResult {
  /** True when the agent produced a valid signal.json before the timeout. */
  success: boolean;
  /** Parsed signal.json contents; present only on success. */
  signal?: SignalData;
  /** Failure cause when success is false. */
  error?: Error;
  /** Process exit code; null when not available from the process manager. */
  exitCode?: number | null;
  /** Captured stderr, when available, for error analysis. */
  stderr?: string;
}
|
||||
|
||||
/**
 * Input for resuming a paused agent with the user's answers.
 */
export interface ResumeAgentOptions {
  /** Identifier of the agent to resume. */
  agentId: string;
  /** Question key → user-provided answer, forwarded back to the agent. */
  answers: Record<string, string>;
}
|
||||
|
||||
export class AgentLifecycleController {
|
||||
constructor(
|
||||
private signalManager: SignalManager,
|
||||
private retryPolicy: RetryPolicy,
|
||||
private errorAnalyzer: AgentErrorAnalyzer,
|
||||
private processManager: ProcessManager,
|
||||
private repository: AgentRepository,
|
||||
private cleanupManager: CleanupManager,
|
||||
private cleanupStrategy: CleanupStrategy,
|
||||
private accountRepository?: AccountRepository,
|
||||
private debug: boolean = false,
|
||||
) {}
|
||||
|
||||
/**
|
||||
* Execute spawn operation with comprehensive retry and error handling.
|
||||
* Always clears signal.json before starting and waits for process completion.
|
||||
*/
|
||||
async spawnWithRetry(
|
||||
spawnFn: (options: SpawnAgentOptions) => Promise<AgentInfo>,
|
||||
options: SpawnAgentOptions
|
||||
): Promise<AgentInfo> {
|
||||
log.info({
|
||||
taskId: options.taskId,
|
||||
provider: options.provider,
|
||||
initiativeId: options.initiativeId,
|
||||
mode: options.mode
|
||||
}, 'starting agent spawn with retry');
|
||||
|
||||
return this.executeWithRetry('spawn', spawnFn, options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute resume operation with comprehensive retry and error handling.
|
||||
* Always clears signal.json before resuming and waits for process completion.
|
||||
*/
|
||||
async resumeWithRetry(
|
||||
resumeFn: (agentId: string, answers: Record<string, string>) => Promise<void>,
|
||||
options: ResumeAgentOptions
|
||||
): Promise<void> {
|
||||
log.info({
|
||||
agentId: options.agentId,
|
||||
answerKeys: Object.keys(options.answers)
|
||||
}, 'starting agent resume with retry');
|
||||
|
||||
await this.executeWithRetry('resume', async () => {
|
||||
await resumeFn(options.agentId, options.answers);
|
||||
const agent = await this.repository.findById(options.agentId);
|
||||
if (!agent) throw new Error(`Agent '${options.agentId}' not found after resume`);
|
||||
return this.toAgentInfo(agent);
|
||||
}, options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Main retry orchestrator for spawn/resume operations.
|
||||
*/
|
||||
private async executeWithRetry<T>(
|
||||
operation: 'spawn' | 'resume',
|
||||
operationFn: (options: T) => Promise<AgentInfo>,
|
||||
options: T
|
||||
): Promise<AgentInfo> {
|
||||
|
||||
for (let attempt = 1; attempt <= this.retryPolicy.maxAttempts; attempt++) {
|
||||
try {
|
||||
log.debug({ operation, attempt, maxAttempts: this.retryPolicy.maxAttempts }, 'starting attempt');
|
||||
|
||||
// Execute operation
|
||||
const agent = await operationFn(options);
|
||||
const agentWorkdir = this.processManager.getAgentWorkdir(agent.worktreeId);
|
||||
|
||||
// CRITICAL: Always clear signal.json before start
|
||||
log.debug({ agentId: agent.id, agentWorkdir }, 'clearing signal.json before process start');
|
||||
await this.signalManager.clearSignal(agentWorkdir);
|
||||
|
||||
// Wait for process completion with robust detection
|
||||
const result = await this.waitForCompletion(agent);
|
||||
|
||||
if (result.success) {
|
||||
// Handle post-completion cleanup
|
||||
await this.handlePostCompletion(agent);
|
||||
log.info({
|
||||
agentId: agent.id,
|
||||
name: agent.name,
|
||||
attempt,
|
||||
operation
|
||||
}, 'agent lifecycle completed successfully');
|
||||
return agent;
|
||||
}
|
||||
|
||||
// Analyze error and determine retry strategy
|
||||
const agentError = await this.errorAnalyzer.analyzeError(
|
||||
result.error || new Error('Unknown completion failure'),
|
||||
result.exitCode,
|
||||
result.stderr,
|
||||
agentWorkdir
|
||||
);
|
||||
|
||||
// Persist error to DB if required
|
||||
if (agentError.shouldPersistToDB) {
|
||||
await this.persistError(agent.id, agentError);
|
||||
}
|
||||
|
||||
// Handle account switching for usage limits
|
||||
if (agentError.requiresAccountSwitch) {
|
||||
await this.handleAccountExhaustion(agent.id);
|
||||
throw new AgentExhaustedError(agentError.message, agentError);
|
||||
}
|
||||
|
||||
// Check if should retry
|
||||
if (!this.retryPolicy.shouldRetry(agentError, attempt)) {
|
||||
log.warn({
|
||||
agentId: agent.id,
|
||||
errorType: agentError.type,
|
||||
attempt,
|
||||
maxAttempts: this.retryPolicy.maxAttempts
|
||||
}, 'max retry attempts reached or error not retriable');
|
||||
throw new AgentFailureError(agentError.message, agentError);
|
||||
}
|
||||
|
||||
// Handle special retry cases
|
||||
if (agentError.type === 'missing_signal') {
|
||||
// This would need to modify the options to add instruction prompt
|
||||
// For now, log the special case
|
||||
log.info({
|
||||
agentId: agent.id,
|
||||
attempt
|
||||
}, 'will retry with missing signal instruction (not yet implemented)');
|
||||
}
|
||||
|
||||
// Wait before retry
|
||||
const delay = this.retryPolicy.getRetryDelay(attempt);
|
||||
log.info({
|
||||
agentId: agent.id,
|
||||
attempt,
|
||||
delay,
|
||||
errorType: agentError.type,
|
||||
errorMessage: agentError.message
|
||||
}, 'retrying after delay');
|
||||
await this.delay(delay);
|
||||
|
||||
} catch (error) {
|
||||
if (error instanceof AgentExhaustedError || error instanceof AgentFailureError) {
|
||||
throw error; // Don't retry these
|
||||
}
|
||||
|
||||
if (attempt === this.retryPolicy.maxAttempts) {
|
||||
log.error({
|
||||
operation,
|
||||
attempt,
|
||||
error: error instanceof Error ? error.message : String(error)
|
||||
}, 'final attempt failed, giving up');
|
||||
throw error;
|
||||
}
|
||||
|
||||
log.warn({
|
||||
operation,
|
||||
attempt,
|
||||
error: error instanceof Error ? error.message : String(error)
|
||||
}, 'attempt failed, will retry');
|
||||
}
|
||||
}
|
||||
|
||||
throw new Error('Unexpected: retry loop completed without success or terminal error');
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait for process completion with robust signal detection.
|
||||
* Replaces scattered completion detection with unified approach.
|
||||
*/
|
||||
private async waitForCompletion(agent: AgentInfo): Promise<CompletionResult> {
|
||||
const agentWorkdir = this.processManager.getAgentWorkdir(agent.worktreeId);
|
||||
|
||||
log.debug({
|
||||
agentId: agent.id,
|
||||
name: agent.name,
|
||||
agentWorkdir
|
||||
}, 'waiting for process completion');
|
||||
|
||||
// Wait for process to exit (this would need integration with ProcessManager)
|
||||
// For now, simulate with a timeout approach
|
||||
// TODO: Implement waitForProcessCompletion in ProcessManager
|
||||
|
||||
// Wait for signal within reasonable timeout (30 seconds)
|
||||
const signal = await this.signalManager.waitForSignal(agentWorkdir, 30000);
|
||||
|
||||
if (signal) {
|
||||
log.debug({
|
||||
agentId: agent.id,
|
||||
signalStatus: signal.status
|
||||
}, 'agent completed with valid signal');
|
||||
return { success: true, signal };
|
||||
}
|
||||
|
||||
// No signal found - this is an error condition
|
||||
log.warn({
|
||||
agentId: agent.id,
|
||||
agentWorkdir
|
||||
}, 'process completed without valid signal.json');
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error: new Error('Process completed without valid signal.json'),
|
||||
exitCode: null // Would get from ProcessManager
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle post-completion cleanup based on agent status and debug mode.
|
||||
*/
|
||||
private async handlePostCompletion(agent: AgentInfo): Promise<void> {
|
||||
// Only cleanup if agent is not waiting for user input
|
||||
if (agent.status === 'waiting_for_input') {
|
||||
log.debug({ agentId: agent.id }, 'agent waiting for input, skipping cleanup');
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const cleanupAction = await this.cleanupStrategy.shouldCleanup(agent, this.debug);
|
||||
await this.cleanupStrategy.executeCleanup(agent, cleanupAction);
|
||||
|
||||
log.debug({
|
||||
agentId: agent.id,
|
||||
name: agent.name,
|
||||
cleanupAction
|
||||
}, 'post-completion cleanup executed');
|
||||
} catch (error) {
|
||||
log.warn({
|
||||
agentId: agent.id,
|
||||
error: error instanceof Error ? error.message : String(error)
|
||||
}, 'post-completion cleanup failed');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Persist error details to database for debugging.
|
||||
*/
|
||||
private async persistError(agentId: string, error: AgentError): Promise<void> {
|
||||
try {
|
||||
const errorData = {
|
||||
errorType: error.type,
|
||||
errorMessage: error.message,
|
||||
exitCode: error.exitCode,
|
||||
isTransient: error.isTransient,
|
||||
requiresAccountSwitch: error.requiresAccountSwitch,
|
||||
updatedAt: new Date(),
|
||||
};
|
||||
|
||||
// This would need database schema updates to store error details
|
||||
// For now, just update with basic error info
|
||||
await this.repository.update(agentId, {
|
||||
exitCode: error.exitCode,
|
||||
updatedAt: new Date(),
|
||||
});
|
||||
|
||||
log.debug({
|
||||
agentId,
|
||||
errorType: error.type,
|
||||
exitCode: error.exitCode
|
||||
}, 'error details persisted to database');
|
||||
} catch (dbError) {
|
||||
log.warn({
|
||||
agentId,
|
||||
error: dbError instanceof Error ? dbError.message : String(dbError)
|
||||
}, 'failed to persist error to database');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle account exhaustion by marking account as exhausted.
|
||||
*/
|
||||
private async handleAccountExhaustion(agentId: string): Promise<void> {
|
||||
if (!this.accountRepository) {
|
||||
log.debug({ agentId }, 'no account repository available for exhaustion handling');
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const agent = await this.repository.findById(agentId);
|
||||
if (!agent?.accountId) {
|
||||
log.debug({ agentId }, 'agent has no account ID for exhaustion handling');
|
||||
return;
|
||||
}
|
||||
|
||||
// Mark account as exhausted for 1 hour
|
||||
const exhaustedUntil = new Date(Date.now() + 60 * 60 * 1000);
|
||||
await this.accountRepository.markExhausted(agent.accountId, exhaustedUntil);
|
||||
|
||||
log.info({
|
||||
agentId,
|
||||
accountId: agent.accountId,
|
||||
exhaustedUntil
|
||||
}, 'marked account as exhausted due to usage limits');
|
||||
} catch (error) {
|
||||
log.warn({
|
||||
agentId,
|
||||
error: error instanceof Error ? error.message : String(error)
|
||||
}, 'failed to mark account as exhausted');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple delay utility for retry backoff.
|
||||
*/
|
||||
private delay(ms: number): Promise<void> {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert database agent record to AgentInfo.
|
||||
*/
|
||||
private toAgentInfo(agent: any): AgentInfo {
|
||||
return {
|
||||
id: agent.id,
|
||||
name: agent.name,
|
||||
status: agent.status,
|
||||
initiativeId: agent.initiativeId,
|
||||
worktreeId: agent.worktreeId,
|
||||
};
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user